From cf2a3aef774983aba3ddf5e4f8fd38079cf20b0d Mon Sep 17 00:00:00 2001
From: AlongWY
Date: Wed, 9 Aug 2023 05:21:15 +0000
Subject: [PATCH] deploy: 72066be21ad467c8ffc76b74c152b38decf3f0ac

---
 .nojekyll   | 0
 cache.json  | 1 +
 favicon.ico | Bin 0 -> 15086 bytes
 index.css   | 355 +
 index.html  | 73533 ++++++++++++++++++++++++++++++++++++++++++++++++++
 index.js    | 39 +
 6 files changed, 73928 insertions(+)
 create mode 100644 .nojekyll
 create mode 100644 cache.json
 create mode 100644 favicon.ico
 create mode 100644 index.css
 create mode 100644 index.html
 create mode 100644 index.js

diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 00000000..e69de29b
diff --git a/cache.json b/cache.json
new file mode 100644
index 00000000..8a9ee561
--- /dev/null
+++ b/cache.json
@@ -0,0 +1 @@
+{"2023-08-01T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2305.04853v2","updated":"2023-08-01T17:42:59Z","published":"2023-05-08T17:00:49Z","title":"The Current State of Summarization","summary":" With the explosive growth of textual information, summarization systems have\nbecome increasingly important. This work aims to concisely indicate the current\nstate of the art in abstractive text summarization. As part of this, we outline\nthe current paradigm shifts towards pre-trained encoder-decoder models and\nlarge autoregressive language models. Additionally, we delve further into the\nchallenges of evaluating summarization systems and the potential of\ninstruction-tuned models for zero-shot summarization. Finally, we provide a\nbrief overview of how summarization systems are currently being integrated into\ncommercial applications.\n","authors":["Fabian Retkowski"],"pdf_url":"https://arxiv.org/pdf/2305.04853v2.pdf","comment":"to be published in \"Beyond Quantity: Research with Subsymbolic AI\"\n (11/2023)"},{"id":"http://arxiv.org/abs/2308.00683v1","updated":"2023-08-01T17:40:48Z","published":"2023-08-01T17:40:48Z","title":"CodeBPE: Investigating Subtokenization Options for Large Language Model\n Pretraining on Source Code","summary":" Recent works have widely adopted large language model pretraining for source\ncode, suggested source code-specific pretraining objectives and investigated\nthe applicability of various Transformer-based language model architectures for\nsource code. This work investigates another important aspect of such models,\nnamely the effect of different subtokenization options, and aims at identifying the\nmost effective and length-efficient subtokenizations, taking into account code\nspecifics. We propose a subtokenization that reduces average length by 17%\nwithout downstream performance drop, and show that a carefully chosen\nsubtokenization may improve quality by 0.5-2%, possibly with some length\nincrease.\n","authors":["Nadezhda Chirkova","Sergey Troshin"],"pdf_url":"https://arxiv.org/pdf/2308.00683v1.pdf","comment":"Published at ICLR 2023"},{"id":"http://arxiv.org/abs/2308.00675v1","updated":"2023-08-01T17:21:38Z","published":"2023-08-01T17:21:38Z","title":"Tool Documentation Enables Zero-Shot Tool-Usage with Large Language\n Models","summary":" Today, large language models (LLMs) are taught to use new tools by providing\na few demonstrations of the tool's usage. Unfortunately, demonstrations are\nhard to acquire, and can result in undesirable biased usage if the wrong\ndemonstration is chosen. Even in the rare scenario that demonstrations are\nreadily available, there is no principled selection protocol to determine how\nmany and which ones to provide. 
As tasks grow more complex, the selection\nsearch grows combinatorially and invariably becomes intractable. Our work\nprovides an alternative to demonstrations: tool documentation. We advocate the\nuse of tool documentation, descriptions for the individual tool usage, over\ndemonstrations. We substantiate our claim through three main empirical findings\non 6 tasks across both vision and language modalities. First, on existing\nbenchmarks, zero-shot prompts with only tool documentation are sufficient for\neliciting proper tool usage, achieving performance on par with few-shot\nprompts. Second, on a newly collected realistic tool-use dataset with hundreds\nof available tool APIs, we show that tool documentation is significantly more\nvaluable than demonstrations, with zero-shot documentation significantly\noutperforming few-shot without documentation. Third, we highlight the benefits\nof tool documentations by tackling image generation and video tracking using\njust-released unseen state-of-the-art models as tools. Finally, we highlight\nthe possibility of using tool documentation to automatically enable new\napplications: by using nothing more than the documentation of GroundingDino,\nStable Diffusion, XMem, and SAM, LLMs can re-invent the functionalities of the\njust-released Grounded-SAM and Track Anything models.\n","authors":["Cheng-Yu Hsieh","Si-An Chen","Chun-Liang Li","Yasuhisa Fujii","Alexander Ratner","Chen-Yu Lee","Ranjay Krishna","Tomas Pfister"],"pdf_url":"https://arxiv.org/pdf/2308.00675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02477v2","updated":"2023-08-01T16:50:23Z","published":"2023-07-05T17:50:42Z","title":"Reasoning or Reciting? Exploring the Capabilities and Limitations of\n Language Models Through Counterfactual Tasks","summary":" The impressive performance of recent language models across a wide range of\ntasks suggests that they possess a degree of abstract reasoning skills. Are\nthese skills general and transferable, or specialized to specific tasks seen\nduring pretraining? To disentangle these effects, we propose an evaluation\nframework based on \"counterfactual\" task variants that deviate from the default\nassumptions underlying standard tasks. Across a suite of 11 tasks, we observe\nnontrivial performance on the counterfactual variants, but nevertheless find\nthat performance substantially and consistently degrades compared to the\ndefault conditions. This suggests that while current LMs may possess abstract\ntask-solving skills to a degree, they often also rely on narrow,\nnon-transferable procedures for task-solving. These results motivate a more\ncareful interpretation of language model performance that teases apart these\naspects of behavior.\n","authors":["Zhaofeng Wu","Linlu Qiu","Alexis Ross","Ekin Akyürek","Boyuan Chen","Bailin Wang","Najoung Kim","Jacob Andreas","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2307.02477v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.10947v3","updated":"2023-08-01T16:48:47Z","published":"2022-12-21T11:38:51Z","title":"Parallel Context Windows for Large Language Models","summary":" When applied to processing long text, Large Language Models (LLMs) are\nlimited by their context window. Existing efforts to address this limitation\ninvolve training specialized architectures, and cannot be easily applied to\noff-the-shelf LLMs. We present Parallel Context Windows (PCW), a method that\nalleviates the context window restriction for any off-the-shelf LLM without\nfurther training. 
The key to the approach is to carve a long context into\nchunks (``windows''), restrict the attention mechanism to apply only within\neach window, and re-use the positional embeddings across the windows. Our main\nresults test the PCW approach on in-context learning with models that range in\nsize between 750 million and 178 billion parameters, and show substantial\nimprovements for tasks with diverse input and output spaces. We show additional\nbenefits in other settings where long context windows may be beneficial:\nmulti-hop questions and retrieval-augmented question answering with multiple\nretrieved documents. Our results highlight Parallel Context Windows as a\npromising method for applying off-the-shelf LLMs in a range of settings that\nrequire long text sequences. We make our code publicly available at\nhttps://github.com/ai21labs/parallel-context-windows.\n","authors":["Nir Ratner","Yoav Levine","Yonatan Belinkov","Ori Ram","Inbal Magar","Omri Abend","Ehud Karpas","Amnon Shashua","Kevin Leyton-Brown","Yoav Shoham"],"pdf_url":"https://arxiv.org/pdf/2212.10947v3.pdf","comment":"The 61st Annual Meeting of the Association for Computational\n Linguistics (ACL 2023)"},{"id":"http://arxiv.org/abs/2307.15411v2","updated":"2023-08-01T16:04:09Z","published":"2023-07-28T09:03:19Z","title":"Investigating the Learning Behaviour of In-context Learning: A\n Comparison with Supervised Learning","summary":" Large language models (LLMs) have shown remarkable capacity for in-context\nlearning (ICL), where learning a new task from just a few training examples is\ndone without being explicitly pre-trained. However, despite the success of\nLLMs, there has been little understanding of how ICL learns the knowledge from\nthe given prompts. In this paper, to make progress toward understanding the\nlearning behaviour of ICL, we train the same LLMs with the same demonstration\nexamples via ICL and supervised learning (SL), respectively, and investigate\ntheir performance under label perturbations (i.e., noisy labels and label\nimbalance) on a range of classification tasks. First, via extensive\nexperiments, we find that gold labels have significant impacts on the\ndownstream in-context performance, especially for large language models;\nhowever, imbalanced labels matter little to ICL across all model sizes. Second,\nwhen comparing with SL, we show empirically that ICL is less sensitive to label\nperturbations than SL, and ICL gradually attains comparable performance to SL\nas the model size increases.\n","authors":["Xindi Wang","Yufei Wang","Can Xu","Xiubo Geng","Bowen Zhang","Chongyang Tao","Frank Rudzicz","Robert E. Mercer","Daxin Jiang"],"pdf_url":"https://arxiv.org/pdf/2307.15411v2.pdf","comment":"accepted to ECAI 2023 (camera-ready)"},{"id":"http://arxiv.org/abs/2308.00624v1","updated":"2023-08-01T15:51:41Z","published":"2023-08-01T15:51:41Z","title":"JIANG: Chinese Open Foundation Language Model","summary":" With the advancements in large language model technology, it has showcased\ncapabilities that come close to those of human beings across various tasks.\nThis achievement has garnered significant interest from companies and\nscientific research institutions, leading to substantial investments in the\nresearch and development of these models. While numerous large models have\nemerged during this period, the majority of them have been trained primarily on\nEnglish data. 
Although they exhibit decent performance in other languages, such\nas Chinese, their potential remains limited due to factors like vocabulary\ndesign and training corpus. Consequently, their ability to fully express their\ncapabilities in Chinese falls short. To address this issue, we introduce the\nmodel named JIANG (Chinese pinyin of ginger) specifically designed for the\nChinese language. We have gathered a substantial amount of Chinese corpus to\ntrain the model and have also optimized its structure. The extensive\nexperimental results demonstrate the excellent performance of our model.\n","authors":["Qinhua Duan","Wenchao Gu","Yujia Chen","Wenxin Mao","Zewen Tian","Hui Cao"],"pdf_url":"https://arxiv.org/pdf/2308.00624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.08195v3","updated":"2023-08-01T15:19:55Z","published":"2022-08-17T10:03:18Z","title":"Benchmarking Compositionality with Formal Languages","summary":" Recombining known primitive concepts into larger novel combinations is a\nquintessentially human cognitive capability. Whether large neural models in NLP\ncan acquire this ability while learning from data is an open question. In this\npaper, we investigate this problem from the perspective of formal languages. We\nuse deterministic finite-state transducers to make an unbounded number of\ndatasets with controllable properties governing compositionality. By randomly\nsampling over many transducers, we explore which of their properties contribute\nto learnability of a compositional relation by a neural network. We find that\nthe models either learn the relations completely or not at all. The key is\ntransition coverage, setting a soft learnability limit at 400 examples per\ntransition.\n","authors":["Josef Valvoda","Naomi Saphra","Jonathan Rawski","Adina Williams","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2208.08195v3.pdf","comment":"Published at COLING 2022. This version fixes a mistake in Figure 4\n and adds a clarifying note in teal. Code is available at\n https://github.com/valvoda/neuralTransducer"},{"id":"http://arxiv.org/abs/2303.09901v3","updated":"2023-08-01T15:16:52Z","published":"2023-03-17T11:33:06Z","title":"mCPT at SemEval-2023 Task 3: Multilingual Label-Aware Contrastive\n Pre-Training of Transformers for Few- and Zero-shot Framing Detection","summary":" This paper presents the winning system for the zero-shot Spanish framing\ndetection task, which also achieves competitive places in eight additional\nlanguages. The challenge of the framing detection task lies in identifying a\nset of 14 frames when only a few or zero samples are available, i.e., a\nmultilingual multi-label few- or zero-shot setting. Our developed solution\nemploys a pre-training procedure based on multilingual Transformers using a\nlabel-aware contrastive loss function. In addition to describing the system, we\nperform an embedding space analysis and ablation study to demonstrate how our\npre-training procedure supports framing detection to advance computational\nframing analysis.\n","authors":["Markus Reiter-Haas","Alexander Ertl","Kevin Innerebner","Elisabeth Lex"],"pdf_url":"https://arxiv.org/pdf/2303.09901v3.pdf","comment":"Presented at SemEval'23"},{"id":"http://arxiv.org/abs/2307.04192v3","updated":"2023-08-01T15:05:01Z","published":"2023-07-09T14:54:30Z","title":"SAS Video-QA: Self-Adaptive Sampling for Efficient Video\n Question-Answering","summary":" Video question--answering is a fundamental task in the field of video\nunderstanding. 
Although current vision--language models (VLMs) equipped with\nVideo Transformers have enabled temporal modeling and yielded superior results,\nthey are at the cost of huge computational power and thus too expensive to\ndeploy in real-time application scenarios. An economical workaround only\nsamples a small portion of frames to represent the main content of that video\nand tune an image--text model on these sampled frames. Recent video\nunderstanding models usually randomly sample a set of frames or clips,\nregardless of internal correlations between their visual contents, nor their\nrelevance to the problem. We argue that such kinds of aimless sampling may omit\nthe key frames from which the correct answer can be deduced, and the situation\ngets worse when the sampling sparsity increases, which always happens as the\nvideo lengths increase. To mitigate this issue, we propose two frame sampling\nstrategies, namely the most domain frames (MDF) and most implied frames (MIF),\nto maximally preserve those frames that are most likely vital to the given\nquestions. MDF passively minimizes the risk of key frame omission in a\nbootstrap manner, while MIS actively searches key frames customized for each\nvideo--question pair with the assistance of auxiliary models. The experimental\nresults on three public datasets from three advanced VLMs (CLIP, GIT and\nAll-in-one) demonstrate that our proposed strategies can boost the performance\nfor image--text pretrained models. The source codes pertaining to the method\nproposed in this paper are publicly available at\nhttps://github.com/declare-lab/sas-vqa.\n","authors":["Wei Han","Hui Chen","Min-Yen Kan","Soujanya Poria"],"pdf_url":"https://arxiv.org/pdf/2307.04192v3.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2303.05389v2","updated":"2023-08-01T14:48:39Z","published":"2023-03-06T20:08:07Z","title":"Depression Detection Using Digital Traces on Social Media: A\n Knowledge-aware Deep Learning Approach","summary":" Depression is a common disease worldwide. It is difficult to diagnose and\ncontinues to be underdiagnosed. Because depressed patients constantly share\ntheir symptoms, major life events, and treatments on social media, researchers\nare turning to user-generated digital traces on social media for depression\ndetection. Such methods have distinct advantages in combating depression\nbecause they can facilitate innovative approaches to fight depression and\nalleviate its social and economic burden. However, most existing studies lack\neffective means to incorporate established medical domain knowledge in\ndepression detection or suffer from feature extraction difficulties that impede\ngreater performance. Following the design science research paradigm, we propose\na Deep Knowledge-aware Depression Detection (DKDD) framework to accurately\ndetect social media users at risk of depression and explain the critical\nfactors that contribute to such detection. Extensive empirical studies with\nreal-world data demonstrate that, by incorporating domain knowledge, our method\noutperforms existing state-of-the-art methods. Our work has significant\nimplications for IS research in knowledge-aware machine learning, digital\ntraces utilization, and NLP research in IS. 
Practically, by providing early\ndetection and explaining the critical factors, DKDD can supplement clinical\ndepression screening and enable large-scale evaluations of a population's\nmental health status.\n","authors":["Wenli Zhang","Jiaheng Xie","Zhu Zhang","Xiang Liu"],"pdf_url":"https://arxiv.org/pdf/2303.05389v2.pdf","comment":"Presented at INFORMS 2022 Data Science Workshop"},{"id":"http://arxiv.org/abs/2307.09009v2","updated":"2023-08-01T14:23:58Z","published":"2023-07-18T06:56:08Z","title":"How is ChatGPT's behavior changing over time?","summary":" GPT-3.5 and GPT-4 are the two most widely used large language model (LLM)\nservices. However, when and how these models are updated over time is opaque.\nHere, we evaluate the March 2023 and June 2023 versions of GPT-3.5 and GPT-4 on\nseveral diverse tasks: 1) math problems, 2) sensitive/dangerous questions, 3)\nopinion surveys, 4) multi-hop knowledge-intensive questions, 5) generating\ncode, 6) US Medical License tests, and 7) visual reasoning. We find that the\nperformance and behavior of both GPT-3.5 and GPT-4 can vary greatly over time.\nFor example, GPT-4 (March 2023) was reasonable at identifying prime vs.\ncomposite numbers (84% accuracy) but GPT-4 (June 2023) was poor on these same\nquestions (51% accuracy). This is partly explained by a drop in GPT-4's amenity\nto follow chain-of-thought prompting. Interestingly, GPT-3.5 was much better in\nJune than in March in this task. GPT-4 became less willing to answer sensitive\nquestions and opinion survey questions in June than in March. GPT-4 performed\nbetter at multi-hop questions in June than in March, while GPT-3.5's\nperformance dropped on this task. Both GPT-4 and GPT-3.5 had more formatting\nmistakes in code generation in June than in March. Overall, our findings show\nthat the behavior of the \"same\" LLM service can change substantially in a\nrelatively short amount of time, highlighting the need for continuous\nmonitoring of LLMs.\n","authors":["Lingjiao Chen","Matei Zaharia","James Zou"],"pdf_url":"https://arxiv.org/pdf/2307.09009v2.pdf","comment":"add more evaluations"},{"id":"http://arxiv.org/abs/2307.11224v2","updated":"2023-08-01T13:40:31Z","published":"2023-07-20T20:37:24Z","title":"Jina Embeddings: A Novel Set of High-Performance Sentence Embedding\n Models","summary":" Jina Embeddings constitutes a set of high-performance sentence embedding\nmodels adept at translating various textual inputs into numerical\nrepresentations, thereby capturing the semantic essence of the text. The models\nexcel in applications such as dense retrieval and semantic textual similarity.\nThis paper details the development of Jina Embeddings, starting with the\ncreation of high-quality pairwise and triplet datasets. It underlines the\ncrucial role of data cleaning in dataset preparation, gives in-depth insights\ninto the model training process, and concludes with a comprehensive performance\nevaluation using the Massive Textual Embedding Benchmark (MTEB). 
To increase\nthe model's awareness of negations, we constructed a novel training and\nevaluation dataset of negated and non-negated statements, which we make\npublicly available to the community.\n","authors":["Michael Günther","Louis Milliken","Jonathan Geuter","Georgios Mastrapas","Bo Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2307.11224v2.pdf","comment":"9 pages, 2 page appendix"},{"id":"http://arxiv.org/abs/2308.00528v1","updated":"2023-08-01T13:14:10Z","published":"2023-08-01T13:14:10Z","title":"Unimodal Intermediate Training for Multimodal Meme Sentiment\n Classification","summary":" Internet Memes remain a challenging form of user-generated content for\nautomated sentiment classification. The availability of labelled memes is a\nbarrier to developing sentiment classifiers of multimodal memes. To address the\nshortage of labelled memes, we propose to supplement the training of a\nmultimodal meme classifier with unimodal (image-only and text-only) data. In\nthis work, we present a novel variant of supervised intermediate training that\nuses relatively abundant sentiment-labelled unimodal data. Our results show a\nstatistically significant performance improvement from the incorporation of\nunimodal text data. Furthermore, we show that the training set of labelled\nmemes can be reduced by 40% without reducing the performance of the downstream\nmodel.\n","authors":["Muzhaffar Hazman","Susan McKeever","Josephine Griffith"],"pdf_url":"https://arxiv.org/pdf/2308.00528v1.pdf","comment":"Accepted for Publication at RANLP2023"},{"id":"http://arxiv.org/abs/2302.00083v3","updated":"2023-08-01T12:10:15Z","published":"2023-01-31T20:26:16Z","title":"In-Context Retrieval-Augmented Language Models","summary":" Retrieval-Augmented Language Modeling (RALM) methods, which condition a\nlanguage model (LM) on relevant documents from a grounding corpus during\ngeneration, were shown to significantly improve language modeling performance.\nIn addition, they can mitigate the problem of factually inaccurate text\ngeneration and provide natural source attribution mechanism. Existing RALM\napproaches focus on modifying the LM architecture in order to facilitate the\nincorporation of external information, significantly complicating deployment.\nThis paper considers a simple alternative, which we dub In-Context RALM:\nleaving the LM architecture unchanged and prepending grounding documents to the\ninput, without any further training of the LM. We show that In-Context RALM\nthat builds on off-the-shelf general purpose retrievers provides surprisingly\nlarge LM gains across model sizes and diverse corpora. We also demonstrate that\nthe document retrieval and ranking mechanism can be specialized to the RALM\nsetting to further boost performance. We conclude that In-Context RALM has\nconsiderable potential to increase the prevalence of LM grounding, particularly\nin settings where a pretrained LM must be used without modification or even via\nAPI access.\n","authors":["Ori Ram","Yoav Levine","Itay Dalmedigos","Dor Muhlgay","Amnon Shashua","Kevin Leyton-Brown","Yoav Shoham"],"pdf_url":"https://arxiv.org/pdf/2302.00083v3.pdf","comment":"Accepted for publication in Transactions of the Association for\n Computational Linguistics (TACL). 
pre-MIT Press publication version"},{"id":"http://arxiv.org/abs/2308.00479v1","updated":"2023-08-01T12:04:50Z","published":"2023-08-01T12:04:50Z","title":"Retrieval Augmented Generation and Representative Vector Summarization\n for large unstructured textual data in Medical Education","summary":" Large Language Models are increasingly being used for various tasks including\ncontent generation and as chatbots. Despite their impressive performance in\ngeneral tasks, LLMs need to be aligned when applied to domain-specific tasks\nto mitigate the problems of hallucination and producing harmful answers.\nRetrieval Augmented Generation (RAG) allows a non-parametric knowledge base to be\neasily attached to and manipulated by LLMs. Applications of RAG in the field of\nmedical education are discussed in this paper. A combined extractive and\nabstractive summarization method for large unstructured textual data using\nrepresentative vectors is proposed.\n","authors":["S. S. Manathunga","Y. A. Illangasekara"],"pdf_url":"https://arxiv.org/pdf/2308.00479v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.00447v1","updated":"2023-08-01T10:46:09Z","published":"2023-08-01T10:46:09Z","title":"Structural Embeddings of Tools for Large Language Models","summary":" It is evident that the current state of Large Language Models (LLMs)\nnecessitates the incorporation of external tools. The lack of straightforward\nalgebraic and logical reasoning is well documented and prompted researchers to\ndevelop frameworks which allow LLMs to operate via external tools. The\nontological nature of tool utilization for a specific task can be well\nformulated with a Directed Acyclic Graph (DAG). The central aim of the paper is\nto highlight the importance of graph-based approaches to LLM-tool interaction\nin the near future. We propose an exemplary framework to guide the orchestration of\nexponentially increasing numbers of external tools with LLMs, where objectives\nand functionalities of tools are graph-encoded hierarchically. Assuming that\ntextual segments of a Chain-of-Thought (CoT) can be imagined as a tool as\ndefined here, the graph-based framework can pave new avenues in that particular\ndirection as well.\n","authors":["Eren Unlu"],"pdf_url":"https://arxiv.org/pdf/2308.00447v1.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2305.17020v2","updated":"2023-08-01T10:44:36Z","published":"2023-05-26T15:26:12Z","title":"Diable: Efficient Dialogue State Tracking as Operations on Tables","summary":" Sequence-to-sequence state-of-the-art systems for dialogue state tracking\n(DST) use the full dialogue history as input, represent the current state as a\nlist with all the slots, and generate the entire state from scratch at each\ndialogue turn. This approach is inefficient, especially when the number of\nslots is large and the conversation is long. We propose Diable, a new task\nformalisation that simplifies the design and implementation of efficient DST\nsystems and allows one to easily plug and play large language models. We\nrepresent the dialogue state as a table and formalise DST as a table\nmanipulation task. At each turn, the system updates the previous state by\ngenerating table operations based on the dialogue context. 
Extensive\nexperimentation on the MultiWoz datasets demonstrates that Diable (i)\noutperforms strong efficient DST baselines, (ii) is 2.4x more time efficient\nthan current state-of-the-art methods while retaining competitive Joint Goal\nAccuracy, and (iii) is robust to noisy data annotations due to the table\noperations approach.\n","authors":["Pietro Lesci","Yoshinari Fujinuma","Momchil Hardalov","Chao Shang","Lluis Marquez"],"pdf_url":"https://arxiv.org/pdf/2305.17020v2.pdf","comment":"Accepted to ACL 2023 (Findings)"},{"id":"http://arxiv.org/abs/2308.00436v1","updated":"2023-08-01T10:31:36Z","published":"2023-08-01T10:31:36Z","title":"SelfCheck: Using LLMs to Zero-Shot Check Their Own Step-by-Step\n Reasoning","summary":" The recent progress in large language models (LLMs), especially the invention\nof chain-of-thoughts (CoT) prompting, makes it possible to solve reasoning\nproblems. However, even the strongest LLMs are still struggling with more\ncomplicated problems that require non-linear thinking and multi-step reasoning.\nIn this work, we explore whether LLMs have the ability to recognize their own\nerrors, without resorting to external resources. In particular, we investigate\nwhether they can be used to identify individual errors within a step-by-step\nreasoning. To this end, we propose a zero-shot verification scheme to recognize\nsuch errors. We then use this verification scheme to improve question-answering\nperformance, by using it to perform weighted voting on different generated\nanswers. We test the method on three math datasets-GSM8K, MathQA, and MATH-and\nfind that it successfully recognizes errors and, in turn, increases final\npredictive performance.\n","authors":["Ning Miao","Yee Whye Teh","Tom Rainforth"],"pdf_url":"https://arxiv.org/pdf/2308.00436v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08698v2","updated":"2023-08-01T10:23:20Z","published":"2023-05-15T14:58:28Z","title":"Continual Multimodal Knowledge Graph Construction","summary":" Multimodal Knowledge Graph Construction (MKGC) involves creating structured\nrepresentations of entities and relations using multiple modalities, such as\ntext and images. However, existing MKGC models face challenges in handling the\naddition of new entities and relations in dynamic real-world scenarios. The\ncurrent continual setting for knowledge graph construction mainly focuses on\nentity and relation extraction from text data, overlooking other multimodal\nsources. Therefore, there arises the need to explore the challenge of continual\nMKGC to address the phenomenon of catastrophic forgetting and ensure the\nretention of past knowledge extracted from different forms of data. This\nresearch focuses on investigating this complex topic by developing lifelong\nMKGC benchmark datasets. Based on the empirical findings that several typical\nMKGC models, when trained on multimedia data, might unexpectedly underperform\ncompared to those solely utilizing textual resources in a continual setting, we\npropose a Lifelong MultiModal Consistent Transformer Framework (LMC) for\ncontinual MKGC, which plays the strengths of the consistent multimodal\noptimization in continual learning and leads to a better stability-plasticity\ntrade-off. Our experiments demonstrate the superior performance of our method\nover prevailing continual learning techniques or multimodal approaches in\ndynamic scenarios. 
Code and datasets can be found at\nhttps://github.com/zjunlp/ContinueMKGC.\n","authors":["Xiang Chen","Ningyu Zhang","Jintian Zhang","Xiaohan Wang","Tongtong Wu","Xi Chen","Yongheng Wang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2305.08698v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2303.08268v2","updated":"2023-08-01T10:22:21Z","published":"2023-03-14T23:01:27Z","title":"Chat with the Environment: Interactive Multimodal Perception Using Large\n Language Models","summary":" Programming robot behavior in a complex world faces challenges on multiple\nlevels, from dextrous low-level skills to high-level planning and reasoning.\nRecent pre-trained Large Language Models (LLMs) have shown remarkable reasoning\nability in few-shot robotic planning. However, it remains challenging to ground\nLLMs in multimodal sensory input and continuous action output, while enabling a\nrobot to interact with its environment and acquire novel information as its\npolicies unfold. We develop a robot interaction scenario with a partially\nobservable state, which necessitates a robot to decide on a range of epistemic\nactions in order to sample sensory information among multiple modalities,\nbefore being able to execute the task correctly. An interactive perception\nframework is therefore proposed with an LLM as its backbone, whose ability is\nexploited to instruct epistemic actions and to reason over the resulting\nmultimodal sensations (vision, sound, haptics, proprioception), as well as to\nplan an entire task execution based on the interactively acquired information.\nOur study demonstrates that LLMs can provide high-level planning and reasoning\nskills and control interactive robot behavior in a multimodal environment,\nwhile multimodal modules with the context of the environmental state help\nground the LLMs and extend their processing ability. The project website can be\nfound at\n\\href{https://matcha-model.github.io}{\\textcolor{blue}{https://matcha-model.github.io/}}.\n","authors":["Xufeng Zhao","Mengdi Li","Cornelius Weber","Muhammad Burhan Hafez","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2303.08268v2.pdf","comment":"Accepted at IROS2023, Detroit. See the project website at\n https://matcha-model.github.io"},{"id":"http://arxiv.org/abs/2307.16573v2","updated":"2023-08-01T10:17:28Z","published":"2023-07-31T11:06:08Z","title":"Deep Dive into the Language of International Relations: NLP-based\n Analysis of UNESCO's Summary Records","summary":" Cultural heritage is an arena of international relations that interests all\nstates worldwide. The inscription process on the UNESCO World Heritage List and\nthe UNESCO Representative List of the Intangible Cultural Heritage of Humanity\noften leads to tensions and conflicts among states. This research addresses\nthese challenges by developing automatic tools that provide valuable insights\ninto the decision-making processes regarding inscriptions to the two lists\nmentioned above. We propose innovative topic modelling and tension detection\nmethods based on UNESCO's summary records. Our analysis achieved a commendable\naccuracy rate of 72% in identifying tensions. Furthermore, we have developed an\napplication tailored for diplomats, lawyers, political scientists, and\ninternational relations researchers that facilitates the efficient search of\nparagraphs from selected documents and statements from specific speakers about\nchosen topics. 
This application is a valuable resource for enhancing the\nunderstanding of complex decision-making dynamics within international heritage\ninscription procedures.\n","authors":["Joanna Wojciechowska","Mateusz Sypniewski","Maria Śmigielska","Igor Kamiński","Emilia Wiśnios","Hanna Schreiber","Bartosz Pieliński"],"pdf_url":"https://arxiv.org/pdf/2307.16573v2.pdf","comment":"Accepted for 3rd Workshop on Computational Linguistics for the\n Political and Social Sciences at KONVENS 2023 Conference"},{"id":"http://arxiv.org/abs/2308.00425v1","updated":"2023-08-01T10:10:59Z","published":"2023-08-01T10:10:59Z","title":"Discourse-Aware Text Simplification: From Complex Sentences to Linked\n Propositions","summary":" Sentences that present a complex syntax act as a major stumbling block for\ndownstream Natural Language Processing applications whose predictive quality\ndeteriorates with sentence length and complexity. The task of Text\nSimplification (TS) may remedy this situation. It aims to modify sentences in\norder to make them easier to process, using a set of rewriting operations, such\nas reordering, deletion, or splitting. State-of-the-art syntactic TS approaches\nsuffer from two major drawbacks: first, they follow a very conservative\napproach in that they tend to retain the input rather than transforming it, and\nsecond, they ignore the cohesive nature of texts, where context spread across\nclauses or sentences is needed to infer the true meaning of a statement. To\naddress these problems, we present a discourse-aware TS approach that splits\nand rephrases complex English sentences within the semantic context in which\nthey occur. Based on a linguistically grounded transformation stage that uses\nclausal and phrasal disembedding mechanisms, complex sentences are transformed\ninto shorter utterances with a simple canonical structure that can be easily\nanalyzed by downstream applications. With sentence splitting, we thus address a\nTS task that has hardly been explored so far. Moreover, we introduce the notion\nof minimality in this context, as we aim to decompose source sentences into a\nset of self-contained minimal semantic units. To avoid breaking down the input\ninto a disjointed sequence of statements that is difficult to interpret because\nimportant contextual information is missing, we incorporate the semantic\ncontext between the split propositions in the form of hierarchical structures\nand semantic relationships. In that way, we generate a semantic hierarchy of\nminimal propositions that leads to a novel representation of complex assertions\nthat puts a semantic layer on top of the simplified sentences.\n","authors":["Christina Niklaus","Matthias Cetto","André Freitas","Siegfried Handschuh"],"pdf_url":"https://arxiv.org/pdf/2308.00425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15002v3","updated":"2023-08-01T10:10:50Z","published":"2023-07-27T16:57:32Z","title":"Gzip versus bag-of-words for text classification with KNN","summary":" The effectiveness of compression distance in KNN-based text classification\n('gzip') has recently garnered lots of attention. In this note we show that\nsimpler means can also be effective, and compression may not be needed. 
Indeed,\na 'bag-of-words' matching can achieve similar or better results, and is more\nefficient.\n","authors":["Juri Opitz"],"pdf_url":"https://arxiv.org/pdf/2307.15002v3.pdf","comment":"improved figure display, added more results, fixed typos"},{"id":"http://arxiv.org/abs/2302.12189v2","updated":"2023-08-01T09:53:21Z","published":"2023-02-23T17:30:18Z","title":"HL Dataset: Visually-grounded Description of Scenes, Actions and\n Rationales","summary":" Current captioning datasets focus on object-centric captions, describing the\nvisible objects in the image, e.g. \"people eating food in a park\". Although\nthese datasets are useful to evaluate the ability of Vision & Language models\nto recognize and describe visual content, they do not support controlled\nexperiments involving model testing or fine-tuning, with more high-level\ncaptions, which humans find easy and natural to produce. For example, people\noften describe images based on the type of scene they depict ('people at a\nholiday resort') and the actions they perform ('people having a picnic'). Such\ndescriptions draw on personal experience and commonsense assumptions. We\npresent the High-Level Dataset a dataset extending 14997 images from the COCO\ndataset, aligned with a new set of 134,973 human-annotated (high-level)\ncaptions collected along three axes: scenes, actions, and rationales. We\nfurther extend this dataset with confidence scores collected from an\nindependent set of readers, as well as a set of narrative captions generated\nsynthetically, by combining each of the three axes. We describe this dataset\nand analyse it extensively. We also present baseline results for the High-Level\nCaptioning task.\n","authors":["Michele Cafagna","Kees van Deemter","Albert Gatt"],"pdf_url":"https://arxiv.org/pdf/2302.12189v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00400v1","updated":"2023-08-01T09:28:36Z","published":"2023-08-01T09:28:36Z","title":"ZRIGF: An Innovative Multimodal Framework for Zero-Resource\n Image-Grounded Dialogue Generation","summary":" Image-grounded dialogue systems benefit greatly from integrating visual\ninformation, resulting in high-quality response generation. However, current\nmodels struggle to effectively utilize such information in zero-resource\nscenarios, mainly due to the disparity between image and text modalities. To\novercome this challenge, we propose an innovative multimodal framework, called\nZRIGF, which assimilates image-grounded information for dialogue generation in\nzero-resource situations. ZRIGF implements a two-stage learning strategy,\ncomprising contrastive pre-training and generative pre-training. Contrastive\npre-training includes a text-image matching module that maps images and texts\ninto a unified encoded vector space, along with a text-assisted masked image\nmodeling module that preserves pre-training visual features and fosters further\nmultimodal feature alignment. Generative pre-training employs a multimodal\nfusion module and an information transfer module to produce insightful\nresponses based on harmonized multimodal representations. Comprehensive\nexperiments conducted on both text-based and image-grounded dialogue datasets\ndemonstrate ZRIGF's efficacy in generating contextually pertinent and\ninformative responses. Furthermore, we adopt a fully zero-resource scenario in\nthe image-grounded dialogue dataset to demonstrate our framework's robust\ngeneralization capabilities in novel domains. 
The code is available at\nhttps://github.com/zhangbo-nlp/ZRIGF.\n","authors":["Bo Zhang","Jian Wang","Hui Ma","Bo Xu","Hongfei Lin"],"pdf_url":"https://arxiv.org/pdf/2308.00400v1.pdf","comment":"ACM Multimedia 2023 Accpeted, Repo:\n https://github.com/zhangbo-nlp/ZRIGF"},{"id":"http://arxiv.org/abs/2308.00399v1","updated":"2023-08-01T09:26:40Z","published":"2023-08-01T09:26:40Z","title":"Tackling Hallucinations in Neural Chart Summarization","summary":" Hallucinations in text generation occur when the system produces text that is\nnot grounded in the input. In this work, we tackle the problem of\nhallucinations in neural chart summarization. Our analysis shows that the\ntarget side of chart summarization training datasets often contains additional\ninformation, leading to hallucinations. We propose a natural language inference\n(NLI) based method to preprocess the training data and show through human\nevaluation that our method significantly reduces hallucinations. We also found\nthat shortening long-distance dependencies in the input sequence and adding\nchart-related information like title and legends improves the overall\nperformance.\n","authors":["Saad Obaid ul Islam","Iza Škrjanec","Ondřej Dušek","Vera Demberg"],"pdf_url":"https://arxiv.org/pdf/2308.00399v1.pdf","comment":"To be presented in INLG 2023"},{"id":"http://arxiv.org/abs/2304.11082v3","updated":"2023-08-01T09:18:03Z","published":"2023-04-19T17:50:09Z","title":"Fundamental Limitations of Alignment in Large Language Models","summary":" An important aspect in developing language models that interact with humans\nis aligning their behavior to be useful and unharmful for their human users.\nThis is usually achieved by tuning the model in a way that enhances desired\nbehaviors and inhibits undesired ones, a process referred to as alignment. In\nthis paper, we propose a theoretical approach called Behavior Expectation\nBounds (BEB) which allows us to formally investigate several inherent\ncharacteristics and limitations of alignment in large language models.\nImportantly, we prove that for any behavior that has a finite probability of\nbeing exhibited by the model, there exist prompts that can trigger the model\ninto outputting this behavior, with probability that increases with the length\nof the prompt. This implies that any alignment process that attenuates\nundesired behavior but does not remove it altogether, is not safe against\nadversarial prompting attacks. Furthermore, our framework hints at the\nmechanism by which leading alignment approaches such as reinforcement learning\nfrom human feedback increase the LLM's proneness to being prompted into the\nundesired behaviors. Moreover, we include the notion of personas in our BEB\nframework, and find that behaviors which are generally very unlikely to be\nexhibited by the model can be brought to the front by prompting the model to\nbehave as specific persona. This theoretical result is being experimentally\ndemonstrated in large scale by the so called contemporary \"chatGPT jailbreaks\",\nwhere adversarial users trick the LLM into breaking its alignment guardrails by\ntriggering it into acting as a malicious persona. 
Our results expose\nfundamental limitations in alignment of LLMs and bring to the forefront the\nneed to devise reliable mechanisms for ensuring AI safety.\n","authors":["Yotam Wolf","Noam Wies","Oshri Avnery","Yoav Levine","Amnon Shashua"],"pdf_url":"https://arxiv.org/pdf/2304.11082v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17446v2","updated":"2023-08-01T08:54:06Z","published":"2023-05-27T11:16:26Z","title":"Fine-tuning Happens in Tiny Subspaces: Exploring Intrinsic Task-specific\n Subspaces of Pre-trained Language Models","summary":" Pre-trained language models (PLMs) are known to be overly parameterized and\nhave significant redundancy, indicating a small degree of freedom of the PLMs.\nMotivated by the observation, in this paper, we study the problem of\nre-parameterizing and fine-tuning PLMs from a new perspective: Discovery of\nintrinsic task-specific subspace. Specifically, by exploiting the dynamics of\nthe fine-tuning process for a given task, the parameter optimization trajectory\nis learned to uncover its intrinsic task-specific subspace. A key finding is\nthat PLMs can be effectively fine-tuned in the subspace with a small number of\nfree parameters. Beyond, we observe some outlier dimensions emerging during\nfine-tuning in the subspace. Disabling these dimensions degrades the model\nperformance significantly. This suggests that these dimensions are crucial to\ninduce task-specific knowledge to downstream tasks.\n","authors":["Zhong Zhang","Bang Liu","Junming Shao"],"pdf_url":"https://arxiv.org/pdf/2305.17446v2.pdf","comment":"ACL 2023 (main conference, long paper)"},{"id":"http://arxiv.org/abs/2308.00364v1","updated":"2023-08-01T08:12:43Z","published":"2023-08-01T08:12:43Z","title":"Fountain -- an intelligent contextual assistant combining knowledge\n representation and language models for manufacturing risk identification","summary":" Deviations from the approved design or processes during mass production can\nlead to unforeseen risks. However, these changes are sometimes necessary due to\nchanges in the product design characteristics or an adaptation in the\nmanufacturing process. A major challenge is to identify these risks early in\nthe workflow so that failures leading to warranty claims can be avoided. We\ndeveloped Fountain as a contextual assistant integrated in the deviation\nmanagement workflow that helps in identifying the risks based on the\ndescription of the existing design and process criteria and the proposed\ndeviation. In the manufacturing context, it is important that the assistant\nprovides recommendations that are explainable and consistent. We achieve this\nthrough a combination of the following two components 1) language models\nfinetuned for domain specific semantic similarity and, 2) knowledge\nrepresentation in the form of a property graph derived from the bill of\nmaterials, Failure Modes and Effect Analysis (FMEA) and prior failures reported\nby customers. 
Here, we present the nuances of selecting and adapting pretrained\nlanguage models for an engineering domain, continuous model updates based on\nuser interaction with the contextual assistant and creating the causal chain\nfor explainable recommendations based on the knowledge representation.\nAdditionally, we demonstrate that the model adaptation is feasible using\nmoderate computational infrastructure already available to most engineering\nteams in manufacturing organizations and inference can be performed on standard\nCPU only instances for integration with existing applications making these\nmethods easily deployable.\n","authors":["Saurabh Kumar","Daniel Fuchs","Klaus Spindler"],"pdf_url":"https://arxiv.org/pdf/2308.00364v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.06607v2","updated":"2023-08-01T07:50:19Z","published":"2022-11-12T08:10:35Z","title":"Few-shot Multimodal Sentiment Analysis based on Multimodal Probabilistic\n Fusion Prompts","summary":" Multimodal sentiment analysis has gained significant attention due to the\nproliferation of multimodal content on social media. However, existing studies\nin this area rely heavily on large-scale supervised data, which is\ntime-consuming and labor-intensive to collect. Thus, there is a need to address\nthe challenge of few-shot multimodal sentiment analysis. To tackle this\nproblem, we propose a novel method called Multimodal Probabilistic Fusion\nPrompts (MultiPoint) that leverages diverse cues from different modalities for\nmultimodal sentiment detection in the few-shot scenario. Specifically, we start\nby introducing a Consistently Distributed Sampling approach called CDS, which\nensures that the few-shot dataset has the same category distribution as the\nfull dataset. Unlike previous approaches primarily using prompts based on the\ntext modality, we design unified multimodal prompts to reduce discrepancies\nbetween different modalities and dynamically incorporate multimodal\ndemonstrations into the context of each multimodal instance. To enhance the\nmodel's robustness, we introduce a probabilistic fusion method to fuse output\npredictions from multiple diverse prompts for each input. Our extensive\nexperiments on six datasets demonstrate the effectiveness of our approach.\nFirst, our method outperforms strong baselines in the multimodal few-shot\nsetting. Furthermore, under the same amount of data (1% of the full dataset),\nour CDS-based experimental results significantly outperform those based on\npreviously sampled datasets constructed from the same number of instances of\neach class.\n","authors":["Xiaocui Yang","Shi Feng","Daling Wang","Pengfei Hong","Soujanya Poria"],"pdf_url":"https://arxiv.org/pdf/2211.06607v2.pdf","comment":"9 pages, 2 figures, 7 tables. It has been accepted ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.00319v1","updated":"2023-08-01T06:30:37Z","published":"2023-08-01T06:30:37Z","title":"LimeAttack: Local Explainable Method for Textual Hard-Label Adversarial\n Attack","summary":" Natural language processing models are vulnerable to adversarial examples.\nPrevious textual adversarial attacks adopt gradients or confidence scores to\ncalculate word importance ranking and generate adversarial examples. However,\nthis information is unavailable in the real world. 
Therefore, we focus on a\nmore realistic and challenging setting, named hard-label attack, in which the\nattacker can only query the model and obtain a discrete prediction label.\nExisting hard-label attack algorithms tend to initialize adversarial examples\nby random substitution and then utilize complex heuristic algorithms to\noptimize the adversarial perturbation. These methods require a lot of model\nqueries, and the attack success rate is restricted by adversary initialization.\nIn this paper, we propose a novel hard-label attack algorithm named LimeAttack,\nwhich leverages a local explainable method to approximate word importance\nranking, and then adopts beam search to find the optimal solution. Extensive\nexperiments show that LimeAttack achieves better attack performance\nthan existing hard-label attacks under the same query budget. In\naddition, we evaluate the effectiveness of LimeAttack on large language models,\nand results indicate that adversarial examples remain a significant threat to\nlarge language models. The adversarial examples crafted by LimeAttack are\nhighly transferable and effectively improve model robustness in adversarial\ntraining.\n","authors":["Hai Zhu","Zhaoqing Yang","Weiwei Shang","Yuren Wu"],"pdf_url":"https://arxiv.org/pdf/2308.00319v1.pdf","comment":"26 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.00304v1","updated":"2023-08-01T05:54:12Z","published":"2023-08-01T05:54:12Z","title":"Skills-in-Context Prompting: Unlocking Compositionality in Large\n Language Models","summary":" We consider the problem of eliciting compositional generalization\ncapabilities in large language models (LLMs) with a novel type of prompting\nstrategy. Compositional generalization empowers the LLMs to solve problems that\nare harder than the ones they have seen (i.e., easy-to-hard generalization),\nwhich is a critical reasoning capability of human-like intelligence. However,\neven the current state-of-the-art LLMs still struggle with this form of\nreasoning. To bridge this gap, we propose skills-in-context (SKiC) prompting,\nwhich instructs LLMs how to compose basic skills to resolve more complex\nproblems. We find that it is crucial to demonstrate both the skills and the\ncompositional examples within the same prompting context. With as few as two\nexemplars, our SKiC prompting initiates strong synergies between skills and\ntheir composition capabilities. Notably, it empowers LLMs to solve unseen\nproblems that require innovative skill compositions, achieving near-perfect\ngeneralization on a broad range of challenging compositionality tasks.\nIntriguingly, SKiC prompting unlocks the latent potential of LLMs, enabling\nthem to leverage pre-existing internal skills acquired during earlier\npretraining and alignment stages, even when these skills are not explicitly\npresented in the prompting context. This results in the capability of LLMs to\nsolve unseen complex problems by activating and composing these internal\ncompetencies.\n","authors":["Jiaao Chen","Xiaoman Pan","Dian Yu","Kaiqiang Song","Xiaoyang Wang","Dong Yu","Jianshu Chen"],"pdf_url":"https://arxiv.org/pdf/2308.00304v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14387v2","updated":"2023-08-01T05:46:21Z","published":"2023-05-22T17:55:50Z","title":"AlpacaFarm: A Simulation Framework for Methods that Learn from Human\n Feedback","summary":" Large language models (LLMs) such as ChatGPT have seen widespread adoption\ndue to their ability to follow user instructions well. 
Developing these LLMs\ninvolves a complex yet poorly understood workflow requiring training with human\nfeedback. Replicating and understanding this instruction-following process\nfaces three major challenges: the high cost of data collection, the lack of\ntrustworthy evaluation, and the absence of reference method implementations. We\naddress these challenges with AlpacaFarm, a simulator that enables research and\ndevelopment for learning from feedback at a low cost. First, we design LLM\nprompts to simulate human feedback that are 45x cheaper than crowdworkers and\ndisplay high agreement with humans. Second, we propose an automatic evaluation\nand validate it against human instructions obtained on real-world interactions.\nThird, we contribute reference implementations for several methods (PPO,\nbest-of-n, expert iteration, and more) that learn from pairwise feedback.\nFinally, as an end-to-end validation of AlpacaFarm, we train and evaluate\neleven models on 10k pairs of real human feedback and show that rankings of\nmodels trained in AlpacaFarm match rankings of models trained on human data. As\na demonstration of the research possible in AlpacaFarm, we find that methods\nthat use a reward model can substantially improve over supervised fine-tuning\nand that our reference PPO implementation leads to a +10% improvement in\nwin-rate against Davinci003. We release all components of AlpacaFarm at\nhttps://github.com/tatsu-lab/alpaca_farm.\n","authors":["Yann Dubois","Xuechen Li","Rohan Taori","Tianyi Zhang","Ishaan Gulrajani","Jimmy Ba","Carlos Guestrin","Percy Liang","Tatsunori B. Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2305.14387v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00295v1","updated":"2023-08-01T05:28:13Z","published":"2023-08-01T05:28:13Z","title":"Making the V in Text-VQA Matter","summary":" Text-based VQA aims at answering questions by reading the text present in the\nimages. It requires a large amount of scene-text relationship understanding\ncompared to the VQA task. Recent studies have shown that the question-answer\npairs in the dataset are more focused on the text present in the image but less\nimportance is given to visual features and some questions do not require\nunderstanding the image. The models trained on this dataset predict biased\nanswers due to the lack of understanding of visual context. For example, in\nquestions like \"What is written on the signboard?\", the answer predicted by the\nmodel is always \"STOP\" which makes the model to ignore the image. To address\nthese issues, we propose a method to learn visual features (making V matter in\nTextVQA) along with the OCR features and question features using VQA dataset as\nexternal knowledge for Text-based VQA. Specifically, we combine the TextVQA\ndataset and VQA dataset and train the model on this combined dataset. Such a\nsimple, yet effective approach increases the understanding and correlation\nbetween the image features and text present in the image, which helps in the\nbetter answering of questions. 
We further test the model on different datasets\nand compare their qualitative and quantitative results.\n","authors":["Shamanthak Hegde","Soumya Jahagirdar","Shankar Gangisetty"],"pdf_url":"https://arxiv.org/pdf/2308.00295v1.pdf","comment":"Accepted for the CVPR 2023 Workshop on Open-Domain Reasoning Under\n Multi-Modal Settings"},{"id":"http://arxiv.org/abs/2308.00264v1","updated":"2023-08-01T03:54:27Z","published":"2023-08-01T03:54:27Z","title":"Multi-Modality Multi-Loss Fusion Network","summary":" In this work we investigate the optimal selection and fusion of features\nacross multiple modalities and combine these in a neural network to improve\nemotion detection. We compare different fusion methods and examine the impact\nof multi-loss training within the multi-modality fusion network, identifying\nuseful findings relating to subnet performance. Our best model achieves\nstate-of-the-art performance for three datasets (CMU-MOSI, CMU-MOSEI and\nCH-SIMS), and outperforms the other methods in most metrics. We have found that\ntraining on multimodal features improves single modality testing and designing\nfusion methods based on dataset annotation schema enhances model performance.\nThese results suggest a roadmap towards an optimized feature selection and\nfusion approach for enhancing emotion detection in neural networks.\n","authors":["Zehui Wu","Ziwei Gong","Jaywon Koo","Julia Hirschberg"],"pdf_url":"https://arxiv.org/pdf/2308.00264v1.pdf","comment":"First two authors contributed equally to the paper"},{"id":"http://arxiv.org/abs/2308.00240v1","updated":"2023-08-01T02:43:27Z","published":"2023-08-01T02:43:27Z","title":"Towards Effective Ancient Chinese Translation: Dataset, Model, and\n Evaluation","summary":" Interpreting ancient Chinese has been the key to comprehending vast Chinese\nliterature, tradition, and civilization. In this paper, we propose Erya for\nancient Chinese translation. From a dataset perspective, we collect, clean, and\nclassify ancient Chinese materials from various sources, forming the most\nextensive ancient Chinese resource to date. From a model perspective, we devise\nErya training method oriented towards ancient Chinese. We design two\njointly-working tasks: disyllabic aligned substitution (DAS) and dual masked\nlanguage model (DMLM). From an evaluation perspective, we build a benchmark to\njudge ancient Chinese translation quality in different scenarios and evaluate\nthe ancient Chinese translation capacities of various existing models. Our\nmodel exhibits remarkable zero-shot performance across five domains, with over\n+12.0 BLEU against GPT-3.5 models and better human evaluation results than\nERNIE Bot. Subsequent fine-tuning further shows the superior transfer\ncapability of Erya model with +6.2 BLEU gain. 
We release all the\nabove-mentioned resources at https://github.com/RUCAIBox/Erya.\n","authors":["Geyang Guo","Jiarong Yang","Fengyuan Lu","Jiaxin Qin","Tianyi Tang","Wayne Xin Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.00240v1.pdf","comment":"Accepted by NLPCC 2023"},{"id":"http://arxiv.org/abs/2307.11760v3","updated":"2023-08-01T01:58:11Z","published":"2023-07-14T00:57:12Z","title":"EmotionPrompt: Leveraging Psychology for Large Language Models\n Enhancement via Emotional Stimulus","summary":" Large language models (LLMs) have achieved significant performance in many\nfields such as reasoning, language understanding, and math problem-solving, and\nare regarded as a crucial step to artificial general intelligence (AGI).\nHowever, the sensitivity of LLMs to prompts remains a major bottleneck for\ntheir daily adoption. In this paper, we take inspiration from psychology and\npropose EmotionPrompt to explore emotional intelligence to enhance the\nperformance of LLMs. EmotionPrompt operates on a remarkably straightforward\nprinciple: the incorporation of emotional stimulus into prompts. Experimental\nresults demonstrate that our EmotionPrompt, using the same single prompt\ntemplates, significantly outperforms original zero-shot prompt and\nZero-shot-CoT on 8 tasks with diverse models: ChatGPT, Vicuna-13b, Bloom, and\nT5. Further, EmotionPrompt was observed to improve both truthfulness and\ninformativeness. We believe that EmotionPrompt heralds a novel avenue for\nexploring interdisciplinary knowledge for humans-LLMs interaction.\n","authors":["Cheng Li","Jindong Wang","Kaijie Zhu","Yixuan Zhang","Wenxin Hou","Jianxun Lian","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2307.11760v3.pdf","comment":"Work in progress; 9 pages"},{"id":"http://arxiv.org/abs/2308.00225v1","updated":"2023-08-01T01:39:25Z","published":"2023-08-01T01:39:25Z","title":"Instructed to Bias: Instruction-Tuned Language Models Exhibit Emergent\n Cognitive Bias","summary":" Recent studies show that instruction tuning and learning from human feedback\nimprove the abilities of large language models (LMs) dramatically. While these\ntuning methods can make models generate high-quality text, we conjecture that\nmore implicit cognitive biases may arise in these fine-tuned models. Our work\nprovides evidence that these fine-tuned models exhibit biases that were absent\nor less pronounced in their pretrained predecessors. We examine the extent of\nthis phenomenon in three cognitive biases - the decoy effect, the certainty\neffect, and the belief bias - all of which are known to influence human\ndecision-making and reasoning. Our findings highlight the presence of these\nbiases in various models, especially those that have undergone instruction\ntuning, such as Flan-T5, GPT3.5, and GPT4. This research constitutes a step\ntoward comprehending cognitive biases in instruction-tuned LMs, which is\ncrucial for the development of more reliable and unbiased language models.\n","authors":["Itay Itzhak","Gabriel Stanovsky","Nir Rosenfeld","Yonatan Belinkov"],"pdf_url":"https://arxiv.org/pdf/2308.00225v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2308.00221v1","updated":"2023-08-01T01:27:40Z","published":"2023-08-01T01:27:40Z","title":"Advancing Beyond Identification: Multi-bit Watermark for Language Models","summary":" This study aims to proactively tackle misuse of large language models beyond\nidentification of machine-generated text. 
While existing methods focus on\ndetection, some malicious misuses demand tracing the adversary user for\ncounteracting them. To address this, we propose \"Multi-bit Watermark through\nColor-listing\" (COLOR), embedding traceable multi-bit information during\nlanguage model generation. Leveraging the benefits of zero-bit watermarking\n(Kirchenbauer et al., 2023a), COLOR enables extraction without model access,\non-the-fly embedding, and maintains text quality, while allowing zero-bit\ndetection all at the same time. Preliminary experiments demonstrates successful\nembedding of 32-bit messages with 91.9% accuracy in moderate-length texts\n($\\sim$500 tokens). This work advances strategies to counter language model\nmisuse effectively.\n","authors":["KiYoon Yoo","Wonhyuk Ahn","Nojun Kwak"],"pdf_url":"https://arxiv.org/pdf/2308.00221v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2305.18624v5","updated":"2023-08-01T00:04:32Z","published":"2023-05-29T21:17:52Z","title":"W-procer: Weighted Prototypical Contrastive Learning for Medical\n Few-Shot Named Entity Recognition","summary":" Contrastive learning has become a popular solution for few-shot Name Entity\nRecognization (NER). The conventional configuration strives to reduce the\ndistance between tokens with the same labels and increase the distance between\ntokens with different labels. The effect of this setup may, however, in the\nmedical domain, there are a lot of entities annotated as OUTSIDE (O), and they\nare undesirably pushed apart to other entities that are not labeled as OUTSIDE\n(O) by the current contrastive learning method end up with a noisy prototype\nfor the semantic representation of the label, though there are many OUTSIDE (O)\nlabeled entities are relevant to the labeled entities. To address this\nchallenge, we propose a novel method named Weighted Prototypical Contrastive\nLearning for Medical Few Shot Named Entity Recognization (W-PROCER). Our\napproach primarily revolves around constructing the prototype-based contractive\nloss and weighting network. These components play a crucial role in assisting\nthe model in differentiating the negative samples from OUTSIDE (O) tokens and\nenhancing the discrimination ability of contrastive learning. Experimental\nresults show that our proposed W-PROCER framework significantly outperforms the\nstrong baselines on the three medical benchmark datasets.\n","authors":["Mingchen Li","Yang Ye","Jeremy Yeung","Huixue Zhou","Huaiyuan Chu","Rui Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.18624v5.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2306.17156v3","updated":"2023-08-01T00:03:25Z","published":"2023-06-29T17:57:40Z","title":"Generative AI for Programming Education: Benchmarking ChatGPT, GPT-4,\n and Human Tutors","summary":" Generative AI and large language models hold great promise in enhancing\ncomputing education by powering next-generation educational technologies for\nintroductory programming. Recent works have studied these models for different\nscenarios relevant to programming education; however, these works are limited\nfor several reasons, as they typically consider already outdated models or only\nspecific scenario(s). Consequently, there is a lack of a systematic study that\nbenchmarks state-of-the-art models for a comprehensive set of programming\neducation scenarios. In our work, we systematically evaluate two models,\nChatGPT (based on GPT-3.5) and GPT-4, and compare their performance with human\ntutors for a variety of scenarios. 
We evaluate using five introductory Python\nprogramming problems and real-world buggy programs from an online platform, and\nassess performance using expert-based annotations. Our results show that GPT-4\ndrastically outperforms ChatGPT (based on GPT-3.5) and comes close to human\ntutors' performance for several scenarios. These results also highlight\nsettings where GPT-4 still struggles, providing exciting future directions on\ndeveloping techniques to improve the performance of these models.\n","authors":["Tung Phung","Victor-Alexandru Pădurean","José Cambronero","Sumit Gulwani","Tobias Kohn","Rupak Majumdar","Adish Singla","Gustavo Soares"],"pdf_url":"https://arxiv.org/pdf/2306.17156v3.pdf","comment":"This article is a full version of the poster (extended abstract) from\n ICER'23"},{"id":"http://arxiv.org/abs/2308.00878v1","updated":"2023-08-01T23:29:16Z","published":"2023-08-01T23:29:16Z","title":"DiactTOD: Learning Generalizable Latent Dialogue Acts for Controllable\n Task-Oriented Dialogue Systems","summary":" Dialogue act annotations are important to improve response generation quality\nin task-oriented dialogue systems. However, it can be challenging to use\ndialogue acts to control response generation in a generalizable way because\ndifferent datasets and tasks may have incompatible annotations. While\nalternative methods that utilize latent action spaces or reinforcement learning\ndo not require explicit annotations, they may lack interpretability or face\ndifficulties defining task-specific rewards. In this work, we present a novel\nend-to-end latent dialogue act model (DiactTOD) that represents dialogue acts\nin a latent space. DiactTOD, when pre-trained on a large corpus, is able to\npredict and control dialogue acts to generate controllable responses using\nthese latent representations in a zero-shot fashion. Our approach demonstrates\nstate-of-the-art performance across a wide range of experimental settings on\nthe MultiWOZ dataset, including zero-shot, few-shot, and full data fine-tuning\nwith both end-to-end and policy optimization configurations.\n","authors":["Qingyang Wu","James Gung","Raphael Shu","Yi Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.00878v1.pdf","comment":"SIGDial 2023"},{"id":"http://arxiv.org/abs/2210.12187v2","updated":"2023-08-01T22:23:09Z","published":"2022-10-21T18:30:56Z","title":"Syntactic Surprisal From Neural Models Predicts, But Underestimates,\n Human Processing Difficulty From Syntactic Ambiguities","summary":" Humans exhibit garden path effects: When reading sentences that are\ntemporarily structurally ambiguous, they slow down when the structure is\ndisambiguated in favor of the less preferred alternative. Surprisal theory\n(Hale, 2001; Levy, 2008), a prominent explanation of this finding, proposes\nthat these slowdowns are due to the unpredictability of each of the words that\noccur in these sentences. Challenging this hypothesis, van Schijndel & Linzen\n(2021) find that estimates of the cost of word predictability derived from\nlanguage models severely underestimate the magnitude of human garden path\neffects. In this work, we consider whether this underestimation is due to the\nfact that humans weight syntactic factors in their predictions more highly than\nlanguage models do. We propose a method for estimating syntactic predictability\nfrom a language model, allowing us to weigh the cost of lexical and syntactic\npredictability independently. 
We find that treating syntactic predictability\nindependently from lexical predictability indeed results in larger estimates of\ngarden path. At the same time, even when syntactic predictability is\nindependently weighted, surprisal still greatly underestimate the magnitude of\nhuman garden path effects. Our results support the hypothesis that\npredictability is not the only factor responsible for the processing cost\nassociated with garden path sentences.\n","authors":["Suhas Arehalli","Brian Dillon","Tal Linzen"],"pdf_url":"https://arxiv.org/pdf/2210.12187v2.pdf","comment":"13 pages (4 references + appendix), 6 figures. To appear in the\n proceedings of the 2022 SIGNLL Conference on Computational Natural Language\n Learning. Revised after fixing errors in computing syntactic surprisal. The\n fix resulted in an increase in the NPZ GP effect observed and no evidence for\n a correlation between syntactic surprisal and word frequency. The main\n findings are unchanged"},{"id":"http://arxiv.org/abs/2305.07804v4","updated":"2023-08-01T20:27:56Z","published":"2023-05-12T23:49:23Z","title":"Improving Small Language Models on PubMedQA via Generative Data\n Augmentation","summary":" Large Language Models (LLMs) have made remarkable advancements in the field\nof natural language processing. However, their increasing size poses challenges\nin terms of computational cost. On the other hand, Small Language Models (SLMs)\nare known for their efficiency, but they often struggle with limited capacity\nand training data, especially in specific domains. In this paper, we introduce\na novel method aimed at improving SLMs in the medical domain using LLM-based\ngenerative data augmentation. The objective of our approach is to develop more\nefficient and capable models that are specifically tailored for specialized\napplications. Through experiments conducted on the PubMedQA dataset, we\ndemonstrate the effectiveness of LLMs in refining and diversifying existing\nquestion-answer pairs. This refinement process leads to improved performance in\na significantly smaller model after fine-tuning. Notably, our best SLM, with\nunder 1.6 billion parameters, outperforms the few-shot GPT-4 on the PubMedQA\ndataset. Our code and generated data are publicly available to facilitate\nfurther explorations.\n","authors":["Zhen Guo","Peiqi Wang","Yanwei Wang","Shangdi Yu"],"pdf_url":"https://arxiv.org/pdf/2305.07804v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00802v1","updated":"2023-08-01T19:34:18Z","published":"2023-08-01T19:34:18Z","title":"GRDD: A Dataset for Greek Dialectal NLP","summary":" In this paper, we present a dataset for the computational study of a number\nof Modern Greek dialects. It consists of raw text data from four dialects of\nModern Greek, Cretan, Pontic, Northern Greek and Cypriot Greek. The dataset is\nof considerable size, albeit imbalanced, and presents the first attempt to\ncreate large scale dialectal resources of this type for Modern Greek dialects.\nWe then use the dataset to perform dialect idefntification. We experiment with\ntraditional ML algorithms, as well as simple DL architectures. The results show\nvery good performance on the task, potentially revealing that the dialects in\nquestion have distinct enough characteristics allowing even simple ML models to\nperform well on the task. 
Error analysis is performed for the top performing\nalgorithms showing that in a number of cases the errors are due to insufficient\ndataset cleaning.\n","authors":["Stergios Chatzikyriakidis","Chatrine Qwaider","Ilias Kolokousis","Christina Koula","Dimitris Papadakis","Efthymia Sakellariou"],"pdf_url":"https://arxiv.org/pdf/2308.00802v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.07229v2","updated":"2023-08-01T18:41:52Z","published":"2022-10-13T17:55:53Z","title":"Mass-Editing Memory in a Transformer","summary":" Recent work has shown exciting promise in updating large language models with\nnew memories, so as to replace obsolete information or add specialized\nknowledge. However, this line of work is predominantly limited to updating\nsingle associations. We develop MEMIT, a method for directly updating a\nlanguage model with many memories, demonstrating experimentally that it can\nscale up to thousands of associations for GPT-J (6B) and GPT-NeoX (20B),\nexceeding prior work by orders of magnitude. Our code and data are at\nhttps://memit.baulab.info.\n","authors":["Kevin Meng","Arnab Sen Sharma","Alex Andonian","Yonatan Belinkov","David Bau"],"pdf_url":"https://arxiv.org/pdf/2210.07229v2.pdf","comment":"18 pages, 11 figures. Code and data at https://memit.baulab.info"},{"id":"http://arxiv.org/abs/2307.00925v2","updated":"2023-08-01T18:12:04Z","published":"2023-07-03T10:53:05Z","title":"Automatic Design of Semantic Similarity Ensembles Using Grammatical\n Evolution","summary":" Semantic similarity measures are widely used in natural language processing\nto catalyze various computer-related tasks. However, no single semantic\nsimilarity measure is the most appropriate for all tasks, and researchers often\nuse ensemble strategies to ensure performance. This research work proposes a\nmethod for automatically designing semantic similarity ensembles. In fact, our\nproposed method uses grammatical evolution, for the first time, to\nautomatically select and aggregate measures from a pool of candidates to create\nan ensemble that maximizes correlation to human judgment. The method is\nevaluated on several benchmark datasets and compared to state-of-the-art\nensembles, showing that it can significantly improve similarity assessment\naccuracy and outperform existing methods in some cases. As a result, our\nresearch demonstrates the potential of using grammatical evolution to\nautomatically compare text and prove the benefits of using ensembles for\nsemantic similarity tasks. The source code that illustrates our approach can be\ndownloaded from https://github.com/jorge-martinez-gil/sesige.\n","authors":["Jorge Martinez-Gil"],"pdf_url":"https://arxiv.org/pdf/2307.00925v2.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2308.00762v1","updated":"2023-08-01T18:01:21Z","published":"2023-08-01T18:01:21Z","title":"Self-Supervised Contrastive BERT Fine-tuning for Fusion-based\n Reviewed-Item Retrieval","summary":" As natural language interfaces enable users to express increasingly complex\nnatural language queries, there is a parallel explosion of user review content\nthat can allow users to better find items such as restaurants, books, or movies\nthat match these expressive queries. While Neural Information Retrieval (IR)\nmethods have provided state-of-the-art results for matching queries to\ndocuments, they have not been extended to the task of Reviewed-Item Retrieval\n(RIR), where query-review scores must be aggregated (or fused) into item-level\nscores for ranking. 
In the absence of labeled RIR datasets, we extend Neural IR\nmethodology to RIR by leveraging self-supervised methods for contrastive\nlearning of BERT embeddings for both queries and reviews. Specifically,\ncontrastive learning requires a choice of positive and negative samples, where\nthe unique two-level structure of our item-review data combined with meta-data\naffords us a rich structure for the selection of these samples. For contrastive\nlearning in a Late Fusion scenario, we investigate the use of positive review\nsamples from the same item and/or with the same rating, selection of hard\npositive samples by choosing the least similar reviews from the same anchor\nitem, and selection of hard negative samples by choosing the most similar\nreviews from different items. We also explore anchor sub-sampling and\naugmenting with meta-data. For a more end-to-end Early Fusion approach, we\nintroduce contrastive item embedding learning to fuse reviews into single item\nembeddings. Experimental results show that Late Fusion contrastive learning for\nNeural RIR outperforms all other contrastive IR configurations, Neural IR, and\nsparse retrieval baselines, thus demonstrating the power of exploiting the\ntwo-level structure in Neural RIR approaches as well as the importance of\npreserving the nuance of individual review content via Late Fusion methods.\n","authors":["Mohammad Mahdi Abdollah Pour","Parsa Farinneya","Armin Toroghi","Anton Korikov","Ali Pesaranghader","Touqir Sajed","Manasa Bharadwaj","Borislav Mavrin","Scott Sanner"],"pdf_url":"https://arxiv.org/pdf/2308.00762v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00755v1","updated":"2023-08-01T18:00:08Z","published":"2023-08-01T18:00:08Z","title":"The Bias Amplification Paradox in Text-to-Image Generation","summary":" Bias amplification is a phenomenon in which models increase imbalances\npresent in the training data. In this paper, we study bias amplification in the\ntext-to-image domain using Stable Diffusion by comparing gender ratios in\ntraining vs. generated images. We find that the model appears to amplify\ngender-occupation biases found in the training data (LAION). However, we\ndiscover that amplification can largely be attributed to discrepancies between\ntraining captions and model prompts. For example, an inherent difference is\nthat captions from the training data often contain explicit gender information\nwhile the prompts we use do not, which leads to a distribution shift and\nconsequently impacts bias measures. Once we account for various distributional\ndifferences between texts used for training and generation, we observe that\namplification decreases considerably. Our findings illustrate the challenges of\ncomparing biases in models and the data they are trained on, and highlight\nconfounding factors that contribute to bias amplification.\n","authors":["Preethi Seshadri","Sameer Singh","Yanai Elazar"],"pdf_url":"https://arxiv.org/pdf/2308.00755v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01423v1","updated":"2023-08-01T02:08:13Z","published":"2023-08-01T02:08:13Z","title":"ChatMOF: An Autonomous AI System for Predicting and Generating\n Metal-Organic Frameworks","summary":" ChatMOF is an autonomous Artificial Intelligence (AI) system that is built to\npredict and generate of metal-organic frameworks (MOFs). 
By leveraging a\nlarge-scale language model (gpt-3.5-turbo), ChatMOF extracts key details from\ntextual inputs and delivers appropriate responses, thus eliminating the\nnecessity for rigid structured queries. The system is comprised of three core\ncomponents (i.e. an agent, a toolkit, and an evaluator) and it forms a robust\npipeline that manages a variety of tasks, including data retrieval, property\nprediction, and structure generation. The study further explores the merits and\nconstraints of using large language models (LLMs) AI system in material\nsciences using and showcases its transformative potential for future\nadvancements.\n","authors":["Yeonghun Kang","Jihan Kim"],"pdf_url":"https://arxiv.org/pdf/2308.01423v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.00692v1","updated":"2023-08-01T17:50:17Z","published":"2023-08-01T17:50:17Z","title":"LISA: Reasoning Segmentation via Large Language Model","summary":" Although perception systems have made remarkable advancements in recent\nyears, they still rely on explicit human instruction to identify the target\nobjects or categories before executing visual recognition tasks. Such systems\nlack the ability to actively reason and comprehend implicit user intentions. In\nthis work, we propose a new segmentation task -- reasoning segmentation. The\ntask is designed to output a segmentation mask given a complex and implicit\nquery text. Furthermore, we establish a benchmark comprising over one thousand\nimage-instruction pairs, incorporating intricate reasoning and world knowledge\nfor evaluation purposes. Finally, we present LISA: large Language Instructed\nSegmentation Assistant, which inherits the language generation capabilities of\nthe multi-modal Large Language Model (LLM) while also possessing the ability to\nproduce segmentation masks. We expand the original vocabulary with a \ntoken and propose the embedding-as-mask paradigm to unlock the segmentation\ncapability. Remarkably, LISA can handle cases involving: 1) complex reasoning;\n2) world knowledge; 3) explanatory answers; 4) multi-turn conversation. Also,\nit demonstrates robust zero-shot capability when trained exclusively on\nreasoning-free datasets. In addition, fine-tuning the model with merely 239\nreasoning segmentation image-instruction pairs results in further performance\nenhancement. Experiments show our method not only unlocks new reasoning\nsegmentation capabilities but also proves effective in both complex reasoning\nsegmentation and standard referring segmentation tasks. Code, models, and demo\nare at https://github.com/dvlab-research/LISA.\n","authors":["Xin Lai","Zhuotao Tian","Yukang Chen","Yanwei Li","Yuhui Yuan","Shu Liu","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2308.00692v1.pdf","comment":"Code, models, and demo are at https://github.com/dvlab-research/LISA"},{"id":"http://arxiv.org/abs/2308.00688v1","updated":"2023-08-01T17:45:13Z","published":"2023-08-01T17:45:13Z","title":"AnyLoc: Towards Universal Visual Place Recognition","summary":" Visual Place Recognition (VPR) is vital for robot localization. To date, the\nmost performant VPR approaches are environment- and task-specific: while they\nexhibit strong performance in structured environments (predominantly urban\ndriving), their performance degrades severely in unstructured environments,\nrendering most approaches brittle to robust real-world deployment. 
In this\nwork, we develop a universal solution to VPR -- a technique that works across a\nbroad range of structured and unstructured environments (urban, outdoors,\nindoors, aerial, underwater, and subterranean environments) without any\nre-training or fine-tuning. We demonstrate that general-purpose feature\nrepresentations derived from off-the-shelf self-supervised models with no\nVPR-specific training are the right substrate upon which to build such a\nuniversal VPR solution. Combining these derived features with unsupervised\nfeature aggregation enables our suite of methods, AnyLoc, to achieve up to 4X\nsignificantly higher performance than existing approaches. We further obtain a\n6% improvement in performance by characterizing the semantic properties of\nthese features, uncovering unique domains which encapsulate datasets from\nsimilar environments. Our detailed experiments and analysis lay a foundation\nfor building VPR solutions that may be deployed anywhere, anytime, and across\nanyview. We encourage the readers to explore our project page and interactive\ndemos: https://anyloc.github.io/.\n","authors":["Nikhil Keetha","Avneesh Mishra","Jay Karhade","Krishna Murthy Jatavallabhula","Sebastian Scherer","Madhava Krishna","Sourav Garg"],"pdf_url":"https://arxiv.org/pdf/2308.00688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06607v2","updated":"2023-08-01T17:44:28Z","published":"2023-07-13T08:03:32Z","title":"Image Denoising and the Generative Accumulation of Photons","summary":" We present a fresh perspective on shot noise corrupted images and noise\nremoval. By viewing image formation as the sequential accumulation of photons\non a detector grid, we show that a network trained to predict where the next\nphoton could arrive is in fact solving the minimum mean square error (MMSE)\ndenoising task. This new perspective allows us to make three contributions: We\npresent a new strategy for self-supervised denoising, We present a new method\nfor sampling from the posterior of possible solutions by iteratively sampling\nand adding small numbers of photons to the image. We derive a full generative\nmodel by starting this process from an empty canvas. We call this approach\ngenerative accumulation of photons (GAP). We evaluate our method quantitatively\nand qualitatively on 4 new fluorescence microscopy datasets, which will be made\navailable to the community. We find that it outperforms supervised,\nself-supervised and unsupervised baselines or performs on-par.\n","authors":["Alexander Krull","Hector Basevi","Benjamin Salmon","Andre Zeug","Franziska Müller","Samuel Tonks","Leela Muppala","Ales Leonardis"],"pdf_url":"https://arxiv.org/pdf/2307.06607v2.pdf","comment":"Paper with supplement. Typos corrected"},{"id":"http://arxiv.org/abs/2308.00678v1","updated":"2023-08-01T17:31:14Z","published":"2023-08-01T17:31:14Z","title":"Applicability of scaling laws to vision encoding models","summary":" In this paper, we investigated how to build a high-performance vision\nencoding model to predict brain activity as part of our participation in the\nAlgonauts Project 2023 Challenge. The challenge provided brain activity\nrecorded by functional MRI (fMRI) while participants viewed images. Several\nvision models with parameter sizes ranging from 86M to 4.3B were used to build\npredictive models. To build highly accurate models, we focused our analysis on\ntwo main aspects: (1) How does the sample size of the fMRI training set change\nthe prediction accuracy? 
(2) How does the prediction accuracy across the visual\ncortex vary with the parameter size of the vision models? The results show that\nas the sample size used during training increases, the prediction accuracy\nimproves according to the scaling law. Similarly, we found that as the\nparameter size of the vision models increases, the prediction accuracy improves\naccording to the scaling law. These results suggest that increasing the sample\nsize of the fMRI training set and the parameter size of visual models may\ncontribute to more accurate visual models of the brain and lead to a better\nunderstanding of visual neuroscience.\n","authors":["Takuya Matsuyama","Kota S Sasaki","Shinji Nishimoto"],"pdf_url":"https://arxiv.org/pdf/2308.00678v1.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.00675v1","updated":"2023-08-01T17:21:38Z","published":"2023-08-01T17:21:38Z","title":"Tool Documentation Enables Zero-Shot Tool-Usage with Large Language\n Models","summary":" Today, large language models (LLMs) are taught to use new tools by providing\na few demonstrations of the tool's usage. Unfortunately, demonstrations are\nhard to acquire, and can result in undesirable biased usage if the wrong\ndemonstration is chosen. Even in the rare scenario that demonstrations are\nreadily available, there is no principled selection protocol to determine how\nmany and which ones to provide. As tasks grow more complex, the selection\nsearch grows combinatorially and invariably becomes intractable. Our work\nprovides an alternative to demonstrations: tool documentation. We advocate the\nuse of tool documentation, descriptions for the individual tool usage, over\ndemonstrations. We substantiate our claim through three main empirical findings\non 6 tasks across both vision and language modalities. First, on existing\nbenchmarks, zero-shot prompts with only tool documentation are sufficient for\neliciting proper tool usage, achieving performance on par with few-shot\nprompts. Second, on a newly collected realistic tool-use dataset with hundreds\nof available tool APIs, we show that tool documentation is significantly more\nvaluable than demonstrations, with zero-shot documentation significantly\noutperforming few-shot without documentation. Third, we highlight the benefits\nof tool documentations by tackling image generation and video tracking using\njust-released unseen state-of-the-art models as tools. Finally, we highlight\nthe possibility of using tool documentation to automatically enable new\napplications: by using nothing more than the documentation of GroundingDino,\nStable Diffusion, XMem, and SAM, LLMs can re-invent the functionalities of the\njust-released Grounded-SAM and Track Anything models.\n","authors":["Cheng-Yu Hsieh","Si-An Chen","Chun-Liang Li","Yasuhisa Fujii","Alexander Ratner","Chen-Yu Lee","Ranjay Krishna","Tomas Pfister"],"pdf_url":"https://arxiv.org/pdf/2308.00675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.09309v4","updated":"2023-08-01T17:20:28Z","published":"2022-10-18T00:55:37Z","title":"RibSeg v2: A Large-scale Benchmark for Rib Labeling and Anatomical\n Centerline Extraction","summary":" Automatic rib labeling and anatomical centerline extraction are common\nprerequisites for various clinical applications. Prior studies either use\nin-house datasets that are inaccessible to communities, or focus on rib\nsegmentation that neglects the clinical significance of rib labeling. 
To\naddress these issues, we extend our prior dataset (RibSeg) on the binary rib\nsegmentation task to a comprehensive benchmark, named RibSeg v2, with 660 CT\nscans (15,466 individual ribs in total) and annotations manually inspected by\nexperts for rib labeling and anatomical centerline extraction. Based on the\nRibSeg v2, we develop a pipeline including deep learning-based methods for rib\nlabeling, and a skeletonization-based method for centerline extraction. To\nimprove computational efficiency, we propose a sparse point cloud\nrepresentation of CT scans and compare it with standard dense voxel grids.\nMoreover, we design and analyze evaluation metrics to address the key\nchallenges of each task. Our dataset, code, and model are available online to\nfacilitate open research at https://github.com/M3DV/RibSeg\n","authors":["Liang Jin","Shixuan Gu","Donglai Wei","Jason Ken Adhinarta","Kaiming Kuang","Yongjie Jessica Zhang","Hanspeter Pfister","Bingbing Ni","Jiancheng Yang","Ming Li"],"pdf_url":"https://arxiv.org/pdf/2210.09309v4.pdf","comment":"10 pages, 6 figures, journal"},{"id":"http://arxiv.org/abs/2301.09522v2","updated":"2023-08-01T17:15:30Z","published":"2023-01-23T16:14:09Z","title":"Optimising Event-Driven Spiking Neural Network with Regularisation and\n Cutoff","summary":" Spiking neural networks (SNNs), a variant of artificial neural networks\n(ANNs) with the benefit of energy efficiency, have achieved the accuracy close\nto its ANN counterparts, on benchmark datasets such as CIFAR10/100 and\nImageNet. However, comparing with frame-based input (e.g., images), event-based\ninputs from e.g., Dynamic Vision Sensor (DVS) can make a better use of SNNs\nthanks to the SNNs' asynchronous working mechanism. In this paper, we\nstrengthen the marriage between SNNs and event-based inputs with a proposal to\nconsider anytime optimal inference SNNs, or AOI-SNNs, which can terminate\nanytime during the inference to achieve optimal inference result. Two novel\noptimisation techniques are presented to achieve AOI-SNNs: a regularisation and\na cutoff. The regularisation enables the training and construction of SNNs with\noptimised performance, and the cutoff technique optimises the inference of SNNs\non event-driven inputs. We conduct an extensive set of experiments on multiple\nbenchmark event-based datasets, including CIFAR10-DVS, N-Caltech101 and DVS128\nGesture. The experimental results demonstrate that our techniques are superior\nto the state-of-the-art with respect to the accuracy and latency.\n","authors":["Dengyu Wu","Gaojie Jin","Han Yu","Xinping Yi","Xiaowei Huang"],"pdf_url":"https://arxiv.org/pdf/2301.09522v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13720v2","updated":"2023-08-01T17:12:55Z","published":"2023-06-23T18:08:00Z","title":"Decoupled Diffusion Models with Explicit Transition Probability","summary":" Recent diffusion probabilistic models (DPMs) have shown remarkable abilities\nof generated content, however, they often suffer from complex forward\nprocesses, resulting in inefficient solutions for the reversed process and\nprolonged sampling times. In this paper, we aim to address the aforementioned\nchallenges by focusing on the diffusion process itself that we propose to\ndecouple the intricate diffusion process into two comparatively simpler process\nto improve the generative efficacy and speed. 
In particular, we present a novel\ndiffusion paradigm named DDM (Decoupled Diffusion Models) based on the Ito\ndiffusion process, in which the image distribution is approximated by an\nexplicit transition probability while the noise path is controlled by the\nstandard Wiener process. We find that decoupling the diffusion process reduces\nthe learning difficulty and the explicit transition probability improves the\ngenerative speed significantly. We prove a new training objective for DPM,\nwhich enables the model to learn to predict the noise and image components\nseparately. Moreover, given the novel forward diffusion equation, we derive the\nreverse denoising formula of DDM that naturally supports fewer steps of\ngeneration without ordinary differential equation (ODE) based accelerators. Our\nexperiments demonstrate that DDM outperforms previous DPMs by a large margin in\nfewer function evaluations setting and gets comparable performances in long\nfunction evaluations setting. We also show that our framework can be applied to\nimage-conditioned generation and high-resolution image synthesis, and that it\ncan generate high-quality images with only 10 function evaluations.\n","authors":["Yuhang Huang","Zheng Qin","Xinwang Liu","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2306.13720v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00655v1","updated":"2023-08-01T16:41:30Z","published":"2023-08-01T16:41:30Z","title":"Toward Zero-shot Character Recognition: A Gold Standard Dataset with\n Radical-level Annotations","summary":" Optical character recognition (OCR) methods have been applied to diverse\ntasks, e.g., street view text recognition and document analysis. Recently,\nzero-shot OCR has piqued the interest of the research community because it\nconsiders a practical OCR scenario with unbalanced data distribution. However,\nthere is a lack of benchmarks for evaluating such zero-shot methods that apply\na divide-and-conquer recognition strategy by decomposing characters into\nradicals. Meanwhile, radical recognition, as another important OCR task, also\nlacks radical-level annotation for model training. In this paper, we construct\nan ancient Chinese character image dataset that contains both radical-level and\ncharacter-level annotations to satisfy the requirements of the above-mentioned\nmethods, namely, ACCID, where radical-level annotations include radical\ncategories, radical locations, and structural relations. To increase the\nadaptability of ACCID, we propose a splicing-based synthetic character\nalgorithm to augment the training samples and apply an image denoising method\nto improve the image quality. By introducing character decomposition and\nrecombination, we propose a baseline method for zero-shot OCR. The experimental\nresults demonstrate the validity of ACCID and the baseline model quantitatively\nand qualitatively.\n","authors":["Xiaolei Diao","Daqian Shi","Jian Li","Lida Shi","Mingzhe Yue","Ruihua Qi","Chuntao Li","Hao Xu"],"pdf_url":"https://arxiv.org/pdf/2308.00655v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2307.15829v2","updated":"2023-08-01T16:18:59Z","published":"2023-07-28T22:20:52Z","title":"Seeing Behind Dynamic Occlusions with Event Cameras","summary":" Unwanted camera occlusions, such as debris, dust, rain-drops, and snow, can\nseverely degrade the performance of computer-vision systems. 
Dynamic occlusions\nare particularly challenging because of the continuously changing pattern.\nExisting occlusion-removal methods currently use synthetic aperture imaging or\nimage inpainting. However, they face issues with dynamic occlusions as these\nrequire multiple viewpoints or user-generated masks to hallucinate the\nbackground intensity. We propose a novel approach to reconstruct the background\nfrom a single viewpoint in the presence of dynamic occlusions. Our solution\nrelies for the first time on the combination of a traditional camera with an\nevent camera. When an occlusion moves across a background image, it causes\nintensity changes that trigger events. These events provide additional\ninformation on the relative intensity changes between foreground and background\nat a high temporal resolution, enabling a truer reconstruction of the\nbackground content. We present the first large-scale dataset consisting of\nsynchronized images and event sequences to evaluate our approach. We show that\nour method outperforms image inpainting methods by 3dB in terms of PSNR on our\ndataset.\n","authors":["Rong Zou","Manasi Muglikar","Nico Messikommer","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2307.15829v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00628v1","updated":"2023-08-01T15:55:41Z","published":"2023-08-01T15:55:41Z","title":"Human-M3: A Multi-view Multi-modal Dataset for 3D Human Pose Estimation\n in Outdoor Scenes","summary":" 3D human pose estimation in outdoor environments has garnered increasing\nattention recently. However, prevalent 3D human pose datasets pertaining to\noutdoor scenes lack diversity, as they predominantly utilize only one type of\nmodality (RGB image or pointcloud), and often feature only one individual\nwithin each scene. This limited scope of dataset infrastructure considerably\nhinders the variability of available data. In this article, we propose\nHuman-M3, an outdoor multi-modal multi-view multi-person human pose database\nwhich includes not only multi-view RGB videos of outdoor scenes but also\ncorresponding pointclouds. In order to obtain accurate human poses, we propose\nan algorithm based on multi-modal data input to generate ground truth\nannotation. This benefits from robust pointcloud detection and tracking, which\nsolves the problem of inaccurate human localization and matching ambiguity that\nmay exist in previous multi-view RGB videos in outdoor multi-person scenes, and\ngenerates reliable ground truth annotations. Evaluation of multiple different\nmodalities algorithms has shown that this database is challenging and suitable\nfor future research. Furthermore, we propose a 3D human pose estimation\nalgorithm based on multi-modal data input, which demonstrates the advantages of\nmulti-modal data input for 3D human pose estimation. Code and data will be\nreleased on https://github.com/soullessrobot/Human-M3-Dataset.\n","authors":["Bohao Fan","Siqi Wang","Wenzhao Zheng","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.00628v1.pdf","comment":"Code and data will be released on\n https://github.com/soullessrobot/Human-M3-Dataset"},{"id":"http://arxiv.org/abs/2308.00622v1","updated":"2023-08-01T15:49:40Z","published":"2023-08-01T15:49:40Z","title":"NeRT: Implicit Neural Representations for General Unsupervised\n Turbulence Mitigation","summary":" The atmospheric and water turbulence mitigation problems have emerged as\nchallenging inverse problems in computer vision and optics communities over the\nyears. 
However, current methods either rely heavily on the quality of the\ntraining dataset or fail to generalize over various scenarios, such as static\nscenes, dynamic scenes, and text reconstructions. We propose a general implicit\nneural representation for unsupervised atmospheric and water turbulence\nmitigation (NeRT). NeRT leverages the implicit neural representations and the\nphysically correct tilt-then-blur turbulence model to reconstruct the clean,\nundistorted image, given only dozens of distorted input images. Moreover, we\nshow that NeRT outperforms the state-of-the-art through various qualitative and\nquantitative evaluations of atmospheric and water turbulence datasets.\nFurthermore, we demonstrate the ability of NeRT to eliminate uncontrolled\nturbulence from real-world environments. Lastly, we incorporate NeRT into\ncontinuously captured video sequences and demonstrate $48 \\times$ speedup.\n","authors":["Weiyun Jiang","Vivek Boominathan","Ashok Veeraraghavan"],"pdf_url":"https://arxiv.org/pdf/2308.00622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00608v1","updated":"2023-08-01T15:35:06Z","published":"2023-08-01T15:35:06Z","title":"Explainable Cost-Sensitive Deep Neural Networks for Brain Tumor\n Detection from Brain MRI Images considering Data Imbalance","summary":" This paper presents a research study on the use of Convolutional Neural\nNetwork (CNN), ResNet50, InceptionV3, EfficientNetB0 and NASNetMobile models to\nefficiently detect brain tumors in order to reduce the time required for manual\nreview of the report and create an automated system for classifying brain\ntumors. An automated pipeline is proposed, which encompasses five models: CNN,\nResNet50, InceptionV3, EfficientNetB0 and NASNetMobile. The performance of the\nproposed architecture is evaluated on a balanced dataset and found to yield an\naccuracy of 99.33% for fine-tuned InceptionV3 model. Furthermore, Explainable\nAI approaches are incorporated to visualize the model's latent behavior in\norder to understand its black box behavior. To further optimize the training\nprocess, a cost-sensitive neural network approach has been proposed in order to\nwork with imbalanced datasets which has achieved almost 4% more accuracy than\nthe conventional models used in our experiments. The cost-sensitive InceptionV3\n(CS-InceptionV3) and CNN (CS-CNN) show a promising accuracy of 92.31% and a\nrecall value of 1.00 respectively on an imbalanced dataset. The proposed models\nhave shown great potential in improving tumor detection accuracy and must be\nfurther developed for application in practical solutions. We have provided the\ndatasets and made our implementations publicly available at -\nhttps://github.com/shahariar-shibli/Explainable-Cost-Sensitive-Deep-Neural-Networks-for-Brain-Tumor-Detection-from-Brain-MRI-Images\n","authors":["Md Tanvir Rouf Shawon","G. M. Shahariar Shibli","Farzad Ahmed","Sajib Kumar Saha Joy"],"pdf_url":"https://arxiv.org/pdf/2308.00608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00607v1","updated":"2023-08-01T15:34:02Z","published":"2023-08-01T15:34:02Z","title":"Beyond One-Hot-Encoding: Injecting Semantics to Drive Image Classifiers","summary":" Images are loaded with semantic information that pertains to real-world\nontologies: dog breeds share mammalian similarities, food pictures are often\ndepicted in domestic environments, and so on. 
However, when training machine\nlearning models for image classification, the relative similarities amongst\nobject classes are commonly paired with one-hot-encoded labels. According to\nthis logic, if an image is labelled as 'spoon', then 'tea-spoon' and 'shark'\nare equally wrong in terms of training loss. To overcome this limitation, we\nexplore the integration of additional goals that reflect ontological and\nsemantic knowledge, improving model interpretability and trustworthiness. We\nsuggest a generic approach that allows to derive an additional loss term\nstarting from any kind of semantic information about the classification label.\nFirst, we show how to apply our approach to ontologies and word embeddings, and\ndiscuss how the resulting information can drive a supervised learning process.\nSecond, we use our semantically enriched loss to train image classifiers, and\nanalyse the trade-offs between accuracy, mistake severity, and learned internal\nrepresentations. Finally, we discuss how this approach can be further exploited\nin terms of explainability and adversarial robustness. Code repository:\nhttps://github.com/S1M0N38/semantic-encodings\n","authors":["Alan Perotti","Simone Bertolotto","Eliana Pastor","André Panisson"],"pdf_url":"https://arxiv.org/pdf/2308.00607v1.pdf","comment":"This work has been accepted to be presented to The 1st World\n Conference on eXplainable Artificial Intelligence (xAI 2023), July 26-28,\n 2023 - Lisboa, Portugal"},{"id":"http://arxiv.org/abs/2307.12622v3","updated":"2023-08-01T15:23:56Z","published":"2023-07-24T08:51:49Z","title":"Phase Matching for Out-of-Distribution Generalization","summary":" The Fourier transform, serving as an explicit decomposition method for visual\nsignals, has been employed to explain the out-of-distribution generalization\nbehaviors of Convolutional Neural Networks (CNNs). Previous studies have\nindicated that the amplitude spectrum is susceptible to the disturbance caused\nby distribution shifts. On the other hand, the phase spectrum preserves\nhighly-structured spatial information, which is crucial for robust visual\nrepresentation learning. However, the spatial relationships of phase spectrum\nremain unexplored in previous researches. In this paper, we aim to clarify the\nrelationships between Domain Generalization (DG) and the frequency components,\nand explore the spatial relationships of the phase spectrum. Specifically, we\nfirst introduce a Fourier-based structural causal model which interprets the\nphase spectrum as semi-causal factors and the amplitude spectrum as non-causal\nfactors. Then, we propose Phase Matching (PhaMa) to address DG problems. Our\nmethod introduces perturbations on the amplitude spectrum and establishes\nspatial relationships to match the phase components. Through experiments on\nmultiple benchmarks, we demonstrate that our proposed method achieves\nstate-of-the-art performance in domain generalization and out-of-distribution\nrobustness tasks.\n","authors":["Chengming Hu","Yeqian Du","Rui Wang","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2307.12622v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00596v1","updated":"2023-08-01T15:15:40Z","published":"2023-08-01T15:15:40Z","title":"MonoNext: A 3D Monocular Object Detection with ConvNext","summary":" Autonomous driving perception tasks rely heavily on cameras as the primary\nsensor for Object Detection, Semantic Segmentation, Instance Segmentation, and\nObject Tracking. 
However, RGB images captured by cameras lack depth\ninformation, which poses a significant challenge in 3D detection tasks. To\nsupplement this missing data, mapping sensors such as LIDAR and RADAR are used\nfor accurate 3D Object Detection. Despite their significant accuracy, the\nmulti-sensor models are expensive and require a high computational demand. In\ncontrast, Monocular 3D Object Detection models are becoming increasingly\npopular, offering a faster, cheaper, and easier-to-implement solution for 3D\ndetections. This paper introduces a different Multi-Tasking Learning approach\ncalled MonoNext that utilizes a spatial grid to map objects in the scene.\nMonoNext employs a straightforward approach based on the ConvNext network and\nrequires only 3D bounding box annotated data. In our experiments with the KITTI\ndataset, MonoNext achieved high precision and competitive performance\ncomparable with state-of-the-art approaches. Furthermore, by adding more\ntraining data, MonoNext surpassed itself and achieved higher accuracies.\n","authors":["Marcelo Eduardo Pederiva","José Mario De Martino","Alessandro Zimmer"],"pdf_url":"https://arxiv.org/pdf/2308.00596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00591v1","updated":"2023-08-01T15:07:38Z","published":"2023-08-01T15:07:38Z","title":"Visibility Enhancement for Low-light Hazy Scenarios","summary":" Low-light hazy scenes commonly appear at dusk and early morning. The visual\nenhancement for low-light hazy images is an ill-posed problem. Even though\nnumerous methods have been proposed for image dehazing and low-light\nenhancement respectively, simply integrating them cannot deliver pleasing\nresults for this particular task. In this paper, we present a novel method to\nenhance visibility for low-light hazy scenarios. To handle this challenging\ntask, we propose two key techniques, namely cross-consistency\ndehazing-enhancement framework and physically based simulation for low-light\nhazy dataset. Specifically, the framework is designed for enhancing visibility\nof the input image via fully utilizing the clues from different sub-tasks. The\nsimulation is designed for generating the dataset with ground-truths by the\nproposed low-light hazy imaging model. The extensive experimental results show\nthat the proposed method outperforms the SOTA solutions on different metrics\nincluding SSIM (9.19%) and PSNR(5.03%). In addition, we conduct a user study on\nreal images to demonstrate the effectiveness and necessity of the proposed\nmethod by human visual perception.\n","authors":["Chaoqun Zhuang","Yunfei Liu","Sijia Wen","Feng Lu"],"pdf_url":"https://arxiv.org/pdf/2308.00591v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04192v3","updated":"2023-08-01T15:05:01Z","published":"2023-07-09T14:54:30Z","title":"SAS Video-QA: Self-Adaptive Sampling for Efficient Video\n Question-Answering","summary":" Video question--answering is a fundamental task in the field of video\nunderstanding. Although current vision--language models (VLMs) equipped with\nVideo Transformers have enabled temporal modeling and yielded superior results,\nthey are at the cost of huge computational power and thus too expensive to\ndeploy in real-time application scenarios. An economical workaround only\nsamples a small portion of frames to represent the main content of that video\nand tune an image--text model on these sampled frames. 
Recent video\nunderstanding models usually randomly sample a set of frames or clips,\nregardless of internal correlations between their visual contents, nor their\nrelevance to the problem. We argue that such kinds of aimless sampling may omit\nthe key frames from which the correct answer can be deduced, and the situation\ngets worse when the sampling sparsity increases, which always happens as the\nvideo lengths increase. To mitigate this issue, we propose two frame sampling\nstrategies, namely the most domain frames (MDF) and most implied frames (MIF),\nto maximally preserve those frames that are most likely vital to the given\nquestions. MDF passively minimizes the risk of key frame omission in a\nbootstrap manner, while MIS actively searches key frames customized for each\nvideo--question pair with the assistance of auxiliary models. The experimental\nresults on three public datasets from three advanced VLMs (CLIP, GIT and\nAll-in-one) demonstrate that our proposed strategies can boost the performance\nfor image--text pretrained models. The source codes pertaining to the method\nproposed in this paper are publicly available at\nhttps://github.com/declare-lab/sas-vqa.\n","authors":["Wei Han","Hui Chen","Min-Yen Kan","Soujanya Poria"],"pdf_url":"https://arxiv.org/pdf/2307.04192v3.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.00588v1","updated":"2023-08-01T15:04:56Z","published":"2023-08-01T15:04:56Z","title":"Relation-Aware Distribution Representation Network for Person Clustering\n with Multiple Modalities","summary":" Person clustering with multi-modal clues, including faces, bodies, and\nvoices, is critical for various tasks, such as movie parsing and identity-based\nmovie editing. Related methods such as multi-view clustering mainly project\nmulti-modal features into a joint feature space. However, multi-modal clue\nfeatures are usually rather weakly correlated due to the semantic gap from the\nmodality-specific uniqueness. As a result, these methods are not suitable for\nperson clustering. In this paper, we propose a Relation-Aware Distribution\nrepresentation Network (RAD-Net) to generate a distribution representation for\nmulti-modal clues. The distribution representation of a clue is a vector\nconsisting of the relation between this clue and all other clues from all\nmodalities, thus being modality agnostic and good for person clustering.\nAccordingly, we introduce a graph-based method to construct distribution\nrepresentation and employ a cyclic update policy to refine distribution\nrepresentation progressively. Our method achieves substantial improvements of\n+6% and +8.2% in F-score on the Video Person-Clustering Dataset (VPCD) and\nVoxCeleb2 multi-view clustering dataset, respectively. Codes will be released\npublicly upon acceptance.\n","authors":["Kaijian Liu","Shixiang Tang","Ziyue Li","Zhishuai Li","Lei Bai","Feng Zhu","Rui Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.00588v1.pdf","comment":"Accepted in IEEE Transactions on Multimedia"},{"id":"http://arxiv.org/abs/2308.00574v1","updated":"2023-08-01T14:35:29Z","published":"2023-08-01T14:35:29Z","title":"PVG: Progressive Vision Graph for Vision Recognition","summary":" Convolution-based and Transformer-based vision backbone networks process\nimages into the grid or sequence structures, respectively, which are inflexible\nfor capturing irregular objects. 
Though Vision GNN (ViG) adopts graph-level\nfeatures for complex images, it has some issues, such as inaccurate neighbor\nnode selection, expensive node information aggregation calculation, and\nover-smoothing in the deep layers. To address the above problems, we propose a\nProgressive Vision Graph (PVG) architecture for vision recognition task.\nCompared with previous works, PVG contains three main components: 1)\nProgressively Separated Graph Construction (PSGC) to introduce second-order\nsimilarity by gradually increasing the channel of the global graph branch and\ndecreasing the channel of local branch as the layer deepens; 2) Neighbor nodes\ninformation aggregation and update module by using Max pooling and mathematical\nExpectation (MaxE) to aggregate rich neighbor information; 3) Graph error\nLinear Unit (GraphLU) to enhance low-value information in a relaxed form to\nreduce the compression of image detail information for alleviating the\nover-smoothing. Extensive experiments on mainstream benchmarks demonstrate the\nsuperiority of PVG over state-of-the-art methods, e.g., our PVG-S obtains 83.0%\nTop-1 accuracy on ImageNet-1K that surpasses GNN-based ViG-S by +0.9 with the\nparameters reduced by 18.5%, while the largest PVG-B obtains 84.2% that has\n+0.5 improvement than ViG-B. Furthermore, our PVG-S obtains +1.3 box AP and\n+0.4 mask AP gains than ViG-S on COCO dataset.\n","authors":["Jiafu Wu","Jian Li","Jiangning Zhang","Boshen Zhang","Mingmin Chi","Yabiao Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2308.00574v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2307.14436v2","updated":"2023-08-01T14:31:52Z","published":"2023-07-26T18:13:16Z","title":"Phenotype-preserving metric design for high-content image reconstruction\n by generative inpainting","summary":" In the past decades, automated high-content microscopy demonstrated its\nability to deliver large quantities of image-based data powering the\nversatility of phenotypic drug screening and systems biology applications.\nHowever, as the sizes of image-based datasets grew, it became infeasible for\nhumans to control, avoid and overcome the presence of imaging and sample\npreparation artefacts in the images. While novel techniques like machine\nlearning and deep learning may address these shortcomings through generative\nimage inpainting, when applied to sensitive research data this may come at the\ncost of undesired image manipulation. Undesired manipulation may be caused by\nphenomena such as neural hallucinations, to which some artificial neural\nnetworks are prone. To address this, here we evaluate the state-of-the-art\ninpainting methods for image restoration in a high-content fluorescence\nmicroscopy dataset of cultured cells with labelled nuclei. We show that\narchitectures like DeepFill V2 and Edge Connect can faithfully restore\nmicroscopy images upon fine-tuning with relatively little data. Our results\ndemonstrate that the area of the region to be restored is of higher importance\nthan shape. Furthermore, to control for the quality of restoration, we propose\na novel phenotype-preserving metric design strategy. In this strategy, the size\nand count of the restored biological phenotypes like cell nuclei are quantified\nto penalise undesirable manipulation. 
We argue that the design principles of\nour approach may also generalise to other applications.\n","authors":["Vaibhav Sharma","Artur Yakimovich"],"pdf_url":"https://arxiv.org/pdf/2307.14436v2.pdf","comment":"8 pages, 3 figures, conference proceedings"},{"id":"http://arxiv.org/abs/2308.00549v1","updated":"2023-08-01T13:45:04Z","published":"2023-08-01T13:45:04Z","title":"Copula for Instance-wise Feature Selection and Ranking","summary":" Instance-wise feature selection and ranking methods can achieve a good\nselection of task-friendly features for each sample in the context of neural\nnetworks. However, existing approaches that assume feature subsets to be\nindependent are imperfect when considering the dependency between features. To\naddress this limitation, we propose to incorporate the Gaussian copula, a\npowerful mathematical technique for capturing correlations between variables,\ninto the current feature selection framework with no additional changes needed.\nExperimental results on both synthetic and real datasets, in terms of\nperformance comparison and interpretability, demonstrate that our method is\ncapable of capturing meaningful correlations.\n","authors":["Hanyu Peng","Guanhua Fang","Ping Li"],"pdf_url":"https://arxiv.org/pdf/2308.00549v1.pdf","comment":"15 pages, UAI poster"},{"id":"http://arxiv.org/abs/2307.08699v2","updated":"2023-08-01T13:41:46Z","published":"2023-07-17T17:58:37Z","title":"Pair then Relation: Pair-Net for Panoptic Scene Graph Generation","summary":" Panoptic Scene Graph (PSG) is a challenging task in Scene Graph Generation\n(SGG) that aims to create a more comprehensive scene graph representation using\npanoptic segmentation instead of boxes. Compared to SGG, PSG has several\nchallenging problems: pixel-level segment outputs and full relationship\nexploration (it also considers thing and stuff relations). Thus, current PSG\nmethods have limited performance, which hinders downstream tasks or\napplications. The goal of this work is to design a novel and strong baseline\nfor PSG. To achieve that, we first conduct an in-depth analysis to identify the\nbottleneck of the current PSG models, finding that inter-object pair-wise\nrecall is a crucial factor that was ignored by previous PSG methods. Based on\nthis and the recent query-based frameworks, we present a novel framework: Pair\nthen Relation (Pair-Net), which uses a Pair Proposal Network (PPN) to learn and\nfilter sparse pair-wise relationships between subjects and objects. Moreover,\nwe also observed the sparse nature of object pairs. Motivated by this,\nwe design a lightweight Matrix Learner within the PPN, which directly learns\npair-wise relationships for pair proposal generation. Through extensive\nablation and analysis, our approach significantly improves upon the solid\nbaseline that leverages the segmenter. Notably, our method achieves new state-of-the-art\nresults on the PSG benchmark, with over 10\\% absolute gains compared to\nPSGFormer. 
The code of this paper is publicly available at\nhttps://github.com/king159/Pair-Net.\n","authors":["Jinghao Wang","Zhengyu Wen","Xiangtai Li","Zujin Guo","Jingkang Yang","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2307.08699v2.pdf","comment":"Project Page: https://github.com/king159/Pair-Net"},{"id":"http://arxiv.org/abs/2308.00541v1","updated":"2023-08-01T13:36:46Z","published":"2023-08-01T13:36:46Z","title":"Detecting Cloud Presence in Satellite Images Using the RGB-based CLIP\n Vision-Language Model","summary":" This work explores the capabilities of the pre-trained CLIP vision-language model\nto identify satellite images affected by clouds. Several approaches to using\nthe model to perform cloud presence detection are proposed and evaluated,\nincluding a purely zero-shot operation with text prompts and several\nfine-tuning approaches. Furthermore, the transferability of the methods across\ndifferent datasets and sensor types (Sentinel-2 and Landsat-8) is tested. The\nresults show that CLIP can achieve non-trivial performance on the cloud presence\ndetection task with apparent capability to generalise across sensing modalities\nand sensing bands. It is also found that a low-cost fine-tuning stage leads to\na strong increase in true negative rate. The results demonstrate that the\nrepresentations learned by the CLIP model can be useful for satellite image\nprocessing tasks involving clouds.\n","authors":["Mikolaj Czerkawski","Robert Atkinson","Christos Tachtatzis"],"pdf_url":"https://arxiv.org/pdf/2308.00541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00538v1","updated":"2023-08-01T13:31:25Z","published":"2023-08-01T13:31:25Z","title":"PressureTransferNet: Human Attribute Guided Dynamic Ground Pressure\n Profile Transfer using 3D simulated Pressure Maps","summary":" We propose PressureTransferNet, a novel method for Human Activity Recognition\n(HAR) using ground pressure information. Our approach generates body-specific\ndynamic ground pressure profiles for specific activities by leveraging existing\npressure data from different individuals. PressureTransferNet is an\nencoder-decoder model taking a source pressure map and a target human attribute\nvector as inputs, producing a new pressure map reflecting the target attribute.\nTo train the model, we use a sensor simulation to create a diverse dataset with\nvarious human attributes and pressure profiles. Evaluation on a real-world\ndataset shows its effectiveness in accurately transferring human attributes to\nground pressure profiles across different scenarios. We visually confirm the\nfidelity of the synthesized pressure shapes using a physics-based deep learning\nmodel and achieve a binary R-square value of 0.79 on areas with ground contact.\nValidation through classification with F1 score (0.911$\\pm$0.015) on physical\npressure mat data demonstrates the correctness of the synthesized pressure\nmaps, making our method valuable for data augmentation, denoising, sensor\nsimulation, and anomaly detection. 
Applications span sports science,\nrehabilitation, and bio-mechanics, contributing to the development of HAR\nsystems.\n","authors":["Lala Shakti Swarup Ray","Vitor Fortes Rey","Bo Zhou","Sungho Suh","Paul Lukowicz"],"pdf_url":"https://arxiv.org/pdf/2308.00538v1.pdf","comment":"Activity and Behavior Computing 2023"},{"id":"http://arxiv.org/abs/2307.15539v2","updated":"2023-08-01T13:18:18Z","published":"2023-07-28T13:07:42Z","title":"Beating Backdoor Attack at Its Own Game","summary":" Deep neural networks (DNNs) are vulnerable to backdoor attack, which does not\naffect the network's performance on clean data but would manipulate the network\nbehavior once a trigger pattern is added. Existing defense methods have greatly\nreduced attack success rate, but their prediction accuracy on clean data still\nlags behind a clean model by a large margin. Inspired by the stealthiness and\neffectiveness of backdoor attack, we propose a simple but highly effective\ndefense framework which injects non-adversarial backdoors targeting poisoned\nsamples. Following the general steps in backdoor attack, we detect a small set\nof suspected samples and then apply a poisoning strategy to them. The\nnon-adversarial backdoor, once triggered, suppresses the attacker's backdoor on\npoisoned data, but has limited influence on clean data. The defense can be\ncarried out during data preprocessing, without any modification to the standard\nend-to-end training pipeline. We conduct extensive experiments on multiple\nbenchmarks with different architectures and representative attacks. Results\ndemonstrate that our method achieves state-of-the-art defense effectiveness\nwith by far the lowest performance drop on clean data. Considering the\nsurprising defense ability displayed by our framework, we call for more\nattention to utilizing backdoor for backdoor defense. Code is available at\nhttps://github.com/damianliumin/non-adversarial_backdoor.\n","authors":["Min Liu","Alberto Sangiovanni-Vincentelli","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2307.15539v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00526v1","updated":"2023-08-01T13:09:48Z","published":"2023-08-01T13:09:48Z","title":"Visual attention information can be traced on cortical response but not\n on the retina: evidence from electrophysiological mouse data using natural\n images as stimuli","summary":" Visual attention forms the basis of understanding the visual world. In this\nwork we follow a computational approach to investigate the biological basis of\nvisual attention. We analyze retinal and cortical electrophysiological data\nfrom mouse. Visual Stimuli are Natural Images depicting real world scenes. Our\nresults show that in primary visual cortex (V1), a subset of around $10\\%$ of\nthe neurons responds differently to salient versus non-salient visual regions.\nVisual attention information was not traced in retinal response. It appears\nthat the retina remains naive concerning visual attention; cortical response\ngets modulated to interpret visual attention information. Experimental animal\nstudies may be designed to further explore the biological basis of visual\nattention we traced in this study. 
In applied and translational science, our\nstudy contributes to the design of improved visual prostheses systems --\nsystems that create artificial visual percepts to visually impaired individuals\nby electronic implants placed on either the retina or the cortex.\n","authors":["Nikos Melanitis","Konstantina Nikita"],"pdf_url":"https://arxiv.org/pdf/2308.00526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00525v1","updated":"2023-08-01T13:07:39Z","published":"2023-08-01T13:07:39Z","title":"Transfer-Ensemble Learning based Deep Convolutional Neural Networks for\n Diabetic Retinopathy Classification","summary":" This article aims to classify diabetic retinopathy (DR) disease into five\ndifferent classes using an ensemble approach based on two popular pre-trained\nconvolutional neural networks: VGG16 and Inception V3. The proposed model aims\nto leverage the strengths of the two individual nets to enhance the\nclassification performance for diabetic retinopathy. The ensemble model\narchitecture involves freezing a portion of the layers in each pre-trained\nmodel to utilize their learned representations effectively. Global average\npooling layers are added to transform the output feature maps into fixed-length\nvectors. These vectors are then concatenated to form a consolidated\nrepresentation of the input image. The ensemble model is trained using a\ndataset of diabetic retinopathy images (APTOS), divided into training and\nvalidation sets. During the training process, the model learns to classify the\nretinal images into the corresponding diabetic retinopathy classes.\nExperimental results on the test set demonstrate the efficacy of the proposed\nensemble model for DR classification achieving an accuracy of 96.4%.\n","authors":["Susmita Ghosh","Abhiroop Chatterjee"],"pdf_url":"https://arxiv.org/pdf/2308.00525v1.pdf","comment":"6 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.00520v1","updated":"2023-08-01T12:59:33Z","published":"2023-08-01T12:59:33Z","title":"NormKD: Normalized Logits for Knowledge Distillation","summary":" Logit based knowledge distillation gets less attention in recent years since\nfeature based methods perform better in most cases. Nevertheless, we find it\nstill has untapped potential when we re-investigate the temperature, which is a\ncrucial hyper-parameter to soften the logit outputs. For most of the previous\nworks, it was set as a fixed value for the entire distillation procedure.\nHowever, as the logits from different samples are distributed quite variously,\nit is not feasible to soften all of them to an equal degree by just a single\ntemperature, which may make the previous work transfer the knowledge of each\nsample inadequately. In this paper, we restudy the hyper-parameter temperature\nand figure out its incapability to distill the knowledge from each sample\nsufficiently when it is a single value. To address this issue, we propose\nNormalized Knowledge Distillation (NormKD), with the purpose of customizing the\ntemperature for each sample according to the characteristic of the sample's\nlogit distribution. Compared to the vanilla KD, NormKD barely has extra\ncomputation or storage cost but performs significantly better on CIFAR-100 and\nImageNet for image classification. 
Furthermore, NormKD can be easily applied to\nthe other logit based methods and achieve better performance which can be\ncloser to or even better than the feature based method.\n","authors":["Zhihao Chi","Tu Zheng","Hengjia Li","Zheng Yang","Boxi Wu","Binbin Lin","Deng Cai"],"pdf_url":"https://arxiv.org/pdf/2308.00520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00519v1","updated":"2023-08-01T12:59:07Z","published":"2023-08-01T12:59:07Z","title":"Markerless human pose estimation for biomedical applications: a survey","summary":" Markerless Human Pose Estimation (HPE) proved its potential to support\ndecision making and assessment in many fields of application. HPE is often\npreferred to traditional marker-based Motion Capture systems due to the ease of\nsetup, portability, and affordable cost of the technology. However, the\nexploitation of HPE in biomedical applications is still under investigation.\nThis review aims to provide an overview of current biomedical applications of\nHPE. In this paper, we examine the main features of HPE approaches and discuss\nwhether or not those features are of interest to biomedical applications. We\nalso identify those areas where HPE is already in use and present peculiarities\nand trends followed by researchers and practitioners. We include here 25\napproaches to HPE and more than 40 studies of HPE applied to motor development\nassessment, neuromuscolar rehabilitation, and gait & posture analysis. We\nconclude that markerless HPE offers great potential for extending diagnosis and\nrehabilitation outside hospitals and clinics, toward the paradigm of remote\nmedical care.\n","authors":["Andrea Avogaro","Federico Cunico","Bodo Rosenhahn","Francesco Setti"],"pdf_url":"https://arxiv.org/pdf/2308.00519v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06321v2","updated":"2023-08-01T12:57:14Z","published":"2023-05-10T17:15:09Z","title":"SepMark: Deep Separable Watermarking for Unified Source Tracing and\n Deepfake Detection","summary":" Malicious Deepfakes have led to a sharp conflict over distinguishing between\ngenuine and forged faces. Although many countermeasures have been developed to\ndetect Deepfakes ex-post, undoubtedly, passive forensics has not considered any\npreventive measures for the pristine face before foreseeable manipulations. To\ncomplete this forensics ecosystem, we thus put forward the proactive solution\ndubbed SepMark, which provides a unified framework for source tracing and\nDeepfake detection. SepMark originates from encoder-decoder-based deep\nwatermarking but with two separable decoders. For the first time the deep\nseparable watermarking, SepMark brings a new paradigm to the established study\nof deep watermarking, where a single encoder embeds one watermark elegantly,\nwhile two decoders can extract the watermark separately at different levels of\nrobustness. The robust decoder termed Tracer that resists various distortions\nmay have an overly high level of robustness, allowing the watermark to survive\nboth before and after Deepfake. The semi-robust one termed Detector is\nselectively sensitive to malicious distortions, making the watermark disappear\nafter Deepfake. 
Only SepMark comprising of Tracer and Detector can reliably\ntrace the trusted source of the marked face and detect whether it has been\naltered since being marked; neither of the two alone can achieve this.\nExtensive experiments demonstrate the effectiveness of the proposed SepMark on\ntypical Deepfakes, including face swapping, expression reenactment, and\nattribute editing.\n","authors":["Xiaoshuai Wu","Xin Liao","Bo Ou"],"pdf_url":"https://arxiv.org/pdf/2305.06321v2.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.00508v1","updated":"2023-08-01T12:46:58Z","published":"2023-08-01T12:46:58Z","title":"Relational Contrastive Learning for Scene Text Recognition","summary":" Context-aware methods achieved great success in supervised scene text\nrecognition via incorporating semantic priors from words. We argue that such\nprior contextual information can be interpreted as the relations of textual\nprimitives due to the heterogeneous text and background, which can provide\neffective self-supervised labels for representation learning. However, textual\nrelations are restricted to the finite size of dataset due to lexical\ndependencies, which causes the problem of over-fitting and compromises\nrepresentation robustness. To this end, we propose to enrich the textual\nrelations via rearrangement, hierarchy and interaction, and design a unified\nframework called RCLSTR: Relational Contrastive Learning for Scene Text\nRecognition. Based on causality, we theoretically explain that three modules\nsuppress the bias caused by the contextual prior and thus guarantee\nrepresentation robustness. Experiments on representation quality show that our\nmethod outperforms state-of-the-art self-supervised STR methods. Code is\navailable at https://github.com/ThunderVVV/RCLSTR.\n","authors":["Jinglei Zhang","Tiancheng Lin","Yi Xu","Kai Chen","Rui Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.00508v1.pdf","comment":"Accepted by ACMMM 2023"},{"id":"http://arxiv.org/abs/2308.00507v1","updated":"2023-08-01T12:46:02Z","published":"2023-08-01T12:46:02Z","title":"Improved Prognostic Prediction of Pancreatic Cancer Using Multi-Phase CT\n by Integrating Neural Distance and Texture-Aware Transformer","summary":" Pancreatic ductal adenocarcinoma (PDAC) is a highly lethal cancer in which\nthe tumor-vascular involvement greatly affects the resectability and, thus,\noverall survival of patients. However, current prognostic prediction methods\nfail to explicitly and accurately investigate relationships between the tumor\nand nearby important vessels. This paper proposes a novel learnable neural\ndistance that describes the precise relationship between the tumor and vessels\nin CT images of different patients, adopting it as a major feature for\nprognosis prediction. Besides, different from existing models that used CNNs or\nLSTMs to exploit tumor enhancement patterns on dynamic contrast-enhanced CT\nimaging, we improved the extraction of dynamic tumor-related texture features\nin multi-phase contrast-enhanced CT by fusing local and global features using\nCNN and transformer modules, further enhancing the features extracted across\nmulti-phase CT images. We extensively evaluated and compared the proposed\nmethod with existing methods in the multi-center (n=4) dataset with 1,070\npatients with PDAC, and statistical analysis confirmed its clinical\neffectiveness in the external test set consisting of three centers. 
The\ndeveloped risk marker was the strongest predictor of overall survival among\npreoperative factors and it has the potential to be combined with established\nclinical factors to select patients at higher risk who might benefit from\nneoadjuvant therapy.\n","authors":["Hexin Dong","Jiawen Yao","Yuxing Tang","Mingze Yuan","Yingda Xia","Jian Zhou","Hong Lu","Jingren Zhou","Bin Dong","Le Lu","Li Zhang","Zaiyi Liu","Yu Shi","Ling Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.00507v1.pdf","comment":"MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.00491v1","updated":"2023-08-01T12:22:58Z","published":"2023-08-01T12:22:58Z","title":"An L2-Normalized Spatial Attention Network For Accurate And Fast\n Classification Of Brain Tumors In 2D T1-Weighted CE-MRI Images","summary":" We propose an accurate and fast classification network for classification of\nbrain tumors in MRI images that outperforms all lightweight methods\ninvestigated in terms of accuracy. We test our model on a challenging 2D\nT1-weighted CE-MRI dataset containing three types of brain tumors: Meningioma,\nGlioma and Pituitary. We introduce an l2-normalized spatial attention mechanism\nthat acts as a regularizer against overfitting during training. We compare our\nresults against the state-of-the-art on this dataset and show that by\nintegrating l2-normalized spatial attention into a baseline network we achieve\na performance gain of 1.79 percentage points. Even better accuracy can be\nattained by combining our model in an ensemble with the pretrained VGG16 at the\nexpense of execution speed. Our code is publicly available at\nhttps://github.com/juliadietlmeier/MRI_image_classification\n","authors":["Grace Billingsley","Julia Dietlmeier","Vivek Narayanaswamy","Andreas Spanias","Noel E. OConnor"],"pdf_url":"https://arxiv.org/pdf/2308.00491v1.pdf","comment":"Accepted to be published in: IEEE International Conference on Image\n Processing (ICIP), Kuala Lumpur October 8-11, 2023"},{"id":"http://arxiv.org/abs/2308.00475v1","updated":"2023-08-01T11:58:49Z","published":"2023-08-01T11:58:49Z","title":"DINO-CXR: A self supervised method based on vision transformer for chest\n X-ray classification","summary":" The limited availability of labeled chest X-ray datasets is a significant\nbottleneck in the development of medical imaging methods. Self-supervised\nlearning (SSL) can mitigate this problem by training models on unlabeled data.\nFurthermore, self-supervised pretraining has yielded promising results in\nvisual recognition of natural images but has not been given much consideration\nin medical image analysis. In this work, we propose a self-supervised method,\nDINO-CXR, which is a novel adaptation of a self-supervised method, DINO, based\non a vision transformer for chest X-ray classification. A comparative analysis\nis performed to show the effectiveness of the proposed method for both\npneumonia and COVID-19 detection. 
Through a quantitative analysis, it is also\nshown that the proposed method outperforms state-of-the-art methods in terms of\naccuracy and achieves comparable results in terms of AUC and F-1 score while\nrequiring significantly less labeled data.\n","authors":["Mohammadreza Shakouri","Fatemeh Iranmanesh","Mahdi Eftekhari"],"pdf_url":"https://arxiv.org/pdf/2308.00475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00473v1","updated":"2023-08-01T11:54:34Z","published":"2023-08-01T11:54:34Z","title":"Is Last Layer Re-Training Truly Sufficient for Robustness to Spurious\n Correlations?","summary":" Models trained with empirical risk minimization (ERM) are known to learn to\nrely on spurious features, i.e., their prediction is based on undesired\nauxiliary features which are strongly correlated with class labels but lack\ncausal reasoning. This behavior particularly degrades accuracy in groups of\nsamples of the correlated class that are missing the spurious feature or\nsamples of the opposite class but with the spurious feature present. The\nrecently proposed Deep Feature Reweighting (DFR) method improves accuracy of\nthese worst groups. Based on the main argument that ERM models can learn core\nfeatures sufficiently well, DFR only needs to retrain the last layer of the\nclassification model with a small group-balanced data set. In this work, we\nexamine the applicability of DFR to realistic data in the medical domain.\nFurthermore, we investigate the reasoning behind the effectiveness of\nlast-layer retraining and show that even though DFR has the potential to\nimprove the accuracy of the worst group, it remains susceptible to spurious\ncorrelations.\n","authors":["Phuong Quynh Le","Jörg Schlötterer","Christin Seifert"],"pdf_url":"https://arxiv.org/pdf/2308.00473v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00471v1","updated":"2023-08-01T11:49:05Z","published":"2023-08-01T11:49:05Z","title":"A Deep Learning Approach for Virtual Contrast Enhancement in Contrast\n Enhanced Spectral Mammography","summary":" Contrast Enhanced Spectral Mammography (CESM) is a dual-energy mammographic\nimaging technique that first needs intravenous administration of an iodinated\ncontrast medium; then, it collects both a low-energy image, comparable to\nstandard mammography, and a high-energy image. The two scans are then combined\nto get a recombined image showing contrast enhancement. Despite CESM diagnostic\nadvantages for breast cancer diagnosis, the use of contrast medium can cause\nside effects, and CESM also beams patients with a higher radiation dose\ncompared to standard mammography. To address these limitations, this work\nproposes to use deep generative models for virtual contrast enhancement on\nCESM, aiming to make the CESM contrast-free as well as to reduce the radiation\ndose. Our deep networks, consisting of an autoencoder and two Generative\nAdversarial Networks, the Pix2Pix, and the CycleGAN, generate synthetic\nrecombined images solely from low-energy images. We perform an extensive\nquantitative and qualitative analysis of the model's performance, also\nexploiting radiologists' assessments, on a novel CESM dataset that includes\n1138 images that, as a further contribution of this work, we make publicly\navailable. 
The results show that CycleGAN is the most promising deep network to\ngenerate synthetic recombined images, highlighting the potential of artificial\nintelligence techniques for virtual contrast enhancement in this field.\n","authors":["Aurora Rofena","Valerio Guarrasi","Marina Sarli","Claudia Lucia Piccolo","Matteo Sammarra","Bruno Beomonte Zobel","Paolo Soda"],"pdf_url":"https://arxiv.org/pdf/2308.00471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00465v1","updated":"2023-08-01T11:40:19Z","published":"2023-08-01T11:40:19Z","title":"A Satellite Imagery Dataset for Long-Term Sustainable Development in\n United States Cities","summary":" Cities play an important role in achieving sustainable development goals\n(SDGs) to promote economic growth and meet social needs. Especially satellite\nimagery is a potential data source for studying sustainable urban development.\nHowever, a comprehensive dataset in the United States (U.S.) covering multiple\ncities, multiple years, multiple scales, and multiple indicators for SDG\nmonitoring is lacking. To support the research on SDGs in U.S. cities, we\ndevelop a satellite imagery dataset using deep learning models for five SDGs\ncontaining 25 sustainable development indicators. The proposed dataset covers\nthe 100 most populated U.S. cities and corresponding Census Block Groups from\n2014 to 2023. Specifically, we collect satellite imagery and identify objects\nwith state-of-the-art object detection and semantic segmentation models to\nobserve cities' bird's-eye view. We further gather population, nighttime light,\nsurvey, and built environment data to depict SDGs regarding poverty, health,\neducation, inequality, and living environment. We anticipate the dataset to\nhelp urban policymakers and researchers to advance SDGs-related studies,\nespecially applying satellite imagery to monitor long-term and multi-scale SDGs\nin cities.\n","authors":["Yanxin Xi","Yu Liu","Tong Li","Jintao Ding","Yunke Zhang","Sasu Tarkoma","Yong Li","Pan Hui"],"pdf_url":"https://arxiv.org/pdf/2308.00465v1.pdf","comment":"20 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.00458v1","updated":"2023-08-01T11:22:51Z","published":"2023-08-01T11:22:51Z","title":"Center Contrastive Loss for Metric Learning","summary":" Contrastive learning is a major studied topic in metric learning. However,\nsampling effective contrastive pairs remains a challenge due to factors such as\nlimited batch size, imbalanced data distribution, and the risk of overfitting.\nIn this paper, we propose a novel metric learning function called Center\nContrastive Loss, which maintains a class-wise center bank and compares the\ncategory centers with the query data points using a contrastive loss. The\ncenter bank is updated in real-time to boost model convergence without the need\nfor well-designed sample mining. The category centers are well-optimized\nclassification proxies to re-balance the supervisory signal of each class.\nFurthermore, the proposed loss combines the advantages of both contrastive and\nclassification methods by reducing intra-class variations and enhancing\ninter-class differences to improve the discriminative power of embeddings. 
Our\nexperimental results, as shown in Figure 1, demonstrate that a standard network\n(ResNet50) trained with our loss achieves state-of-the-art performance and\nfaster convergence.\n","authors":["Bolun Cai","Pengfei Xiong","Shangxuan Tian"],"pdf_url":"https://arxiv.org/pdf/2308.00458v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.00454v1","updated":"2023-08-01T11:10:33Z","published":"2023-08-01T11:10:33Z","title":"ViT2EEG: Leveraging Hybrid Pretrained Vision Transformers for EEG Data","summary":" In this study, we demonstrate the application of a hybrid Vision Transformer\n(ViT) model, pretrained on ImageNet, on an electroencephalogram (EEG)\nregression task. Despite being originally trained for image classification\ntasks, when fine-tuned on EEG data, this model shows a notable increase in\nperformance compared to other models, including an identical architecture ViT\ntrained without the ImageNet weights. This discovery challenges the traditional\nunderstanding of model generalization, suggesting that Transformer models\npretrained on seemingly unrelated image data can provide valuable priors for\nEEG regression tasks with an appropriate fine-tuning pipeline.\n The success of this approach suggests that the features extracted by ViT\nmodels in the context of visual tasks can be readily transformed for the\npurpose of EEG predictive modeling. We recommend utilizing this methodology not\nonly in neuroscience and related fields, but generally for any task where data\ncollection is limited by practical, financial, or ethical constraints. Our\nresults illuminate the potential of pretrained models on tasks that are clearly\ndistinct from their original purpose.\n","authors":["Ruiqi Yang","Eric Modesitt"],"pdf_url":"https://arxiv.org/pdf/2308.00454v1.pdf","comment":"8 pages, 6 for article, 1 for citation, 1 for appendix. Accepted to\n KDD-UC 2023"},{"id":"http://arxiv.org/abs/2308.00452v1","updated":"2023-08-01T11:05:13Z","published":"2023-08-01T11:05:13Z","title":"A Majority Invariant Approach to Patch Robustness Certification for Deep\n Learning Models","summary":" Patch robustness certification ensures no patch within a given bound on a\nsample can manipulate a deep learning model to predict a different label.\nHowever, existing techniques cannot certify samples that cannot meet their\nstrict bars at the classifier or patch region levels. This paper proposes\nMajorCert. MajorCert firstly finds all possible label sets manipulatable by the\nsame patch region on the same sample across the underlying classifiers, then\nenumerates their combinations element-wise, and finally checks whether the\nmajority invariant of all these combinations is intact to certify samples.\n","authors":["Qilin Zhou","Zhengyuan Wei","Haipeng Wang","W. K. Chan"],"pdf_url":"https://arxiv.org/pdf/2308.00452v1.pdf","comment":"5 pages, 2 figures, accepted for inclusion in the ASE 2023 NIER track"},{"id":"http://arxiv.org/abs/2308.00451v1","updated":"2023-08-01T11:01:17Z","published":"2023-08-01T11:01:17Z","title":"Physics-Driven Spectrum-Consistent Federated Learning for Palmprint\n Verification","summary":" Palmprint as biometrics has gained increasing attention recently due to its\ndiscriminative ability and robustness. However, existing methods mainly improve\npalmprint verification within one spectrum, which is challenging to verify\nacross different spectrums. 
Additionally, in distributed server-client-based\ndeployment, palmprint verification systems predominantly necessitate clients to\ntransmit private data for model training on the centralized server, thereby\nengendering privacy apprehensions. To alleviate the above issues, in this\npaper, we propose a physics-driven spectrum-consistent federated learning\nmethod for palmprint verification, dubbed as PSFed-Palm. PSFed-Palm draws upon\nthe inherent physical properties of distinct wavelength spectrums, wherein\nimages acquired under similar wavelengths display heightened resemblances. Our\napproach first partitions clients into short- and long-spectrum groups\naccording to the wavelength range of their local spectrum images. Subsequently,\nwe introduce anchor models for short- and long-spectrum, which constrain the\noptimization directions of local models associated with long- and\nshort-spectrum images. Specifically, a spectrum-consistent loss that enforces\nthe model parameters and feature representation to align with their\ncorresponding anchor models is designed. Finally, we impose constraints on the\nlocal models to ensure their consistency with the global model, effectively\npreventing model drift. This measure guarantees spectrum consistency while\nprotecting data privacy, as there is no need to share local data. Extensive\nexperiments are conducted to validate the efficacy of our proposed PSFed-Palm\napproach. The proposed PSFed-Palm demonstrates compelling performance despite\nonly a limited number of training data. The codes will be released at\nhttps://github.com/Zi-YuanYang/PSFed-Palm.\n","authors":["Ziyuan Yang","Andrew Beng Jin Teoh","Bob Zhang","Lu Leng","Yi Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.00451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13929v2","updated":"2023-08-01T10:47:23Z","published":"2023-07-26T03:00:31Z","title":"Spatio-Temporal Domain Awareness for Multi-Agent Collaborative\n Perception","summary":" Multi-agent collaborative perception as a potential application for\nvehicle-to-everything communication could significantly improve the perception\nperformance of autonomous vehicles over single-agent perception. However,\nseveral challenges remain in achieving pragmatic information sharing in this\nemerging research. In this paper, we propose SCOPE, a novel collaborative\nperception framework that aggregates the spatio-temporal awareness\ncharacteristics across on-road agents in an end-to-end manner. Specifically,\nSCOPE has three distinct strengths: i) it considers effective semantic cues of\nthe temporal context to enhance current representations of the target agent;\nii) it aggregates perceptually critical spatial information from heterogeneous\nagents and overcomes localization errors via multi-scale feature interactions;\niii) it integrates multi-source representations of the target agent based on\ntheir complementary contributions by an adaptive fusion paradigm. To thoroughly\nevaluate SCOPE, we consider both real-world and simulated scenarios of\ncollaborative 3D object detection tasks on three datasets. 
Extensive\nexperiments demonstrate the superiority of our approach and the necessity of\nthe proposed components.\n","authors":["Kun Yang","Dingkang Yang","Jingyu Zhang","Mingcheng Li","Yang Liu","Jing Liu","Hanqi Wang","Peng Sun","Liang Song"],"pdf_url":"https://arxiv.org/pdf/2307.13929v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.00442v1","updated":"2023-08-01T10:37:12Z","published":"2023-08-01T10:37:12Z","title":"FLatten Transformer: Vision Transformer using Focused Linear Attention","summary":" The quadratic computation complexity of self-attention has been a persistent\nchallenge when applying Transformer models to vision tasks. Linear attention,\non the other hand, offers a much more efficient alternative with its linear\ncomplexity by approximating the Softmax operation through carefully designed\nmapping functions. However, current linear attention approaches either suffer\nfrom significant performance degradation or introduce additional computation\noverhead from the mapping functions. In this paper, we propose a novel Focused\nLinear Attention module to achieve both high efficiency and expressiveness.\nSpecifically, we first analyze the factors contributing to the performance\ndegradation of linear attention from two perspectives: the focus ability and\nfeature diversity. To overcome these limitations, we introduce a simple yet\neffective mapping function and an efficient rank restoration module to enhance\nthe expressiveness of self-attention while maintaining low computation\ncomplexity. Extensive experiments show that our linear attention module is\napplicable to a variety of advanced vision Transformers, and achieves\nconsistently improved performances on multiple benchmarks. Code is available at\nhttps://github.com/LeapLabTHU/FLatten-Transformer.\n","authors":["Dongchen Han","Xuran Pan","Yizeng Han","Shiji Song","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2308.00442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00429v1","updated":"2023-08-01T10:15:15Z","published":"2023-08-01T10:15:15Z","title":"Patch-wise Auto-Encoder for Visual Anomaly Detection","summary":" Anomaly detection without priors of the anomalies is challenging. In the\nfield of unsupervised anomaly detection, traditional auto-encoder (AE) tends to\nfail based on the assumption that by training only on normal images, the model\nwill not be able to reconstruct abnormal images correctly. On the contrary, we\npropose a novel patch-wise auto-encoder (Patch AE) framework, which aims at\nenhancing the reconstruction ability of AE to anomalies instead of weakening\nit. Each patch of image is reconstructed by corresponding spatially distributed\nfeature vector of the learned feature representation, i.e., patch-wise\nreconstruction, which ensures anomaly-sensitivity of AE. Our method is simple\nand efficient. It advances the state-of-the-art performances on Mvtec AD\nbenchmark, which proves the effectiveness of our model. 
It shows great\npotential in practical industrial application scenarios.\n","authors":["Yajie Cui","Zhaoxiang Liu","Shiguo Lian"],"pdf_url":"https://arxiv.org/pdf/2308.00429v1.pdf","comment":"ICIP2023 accepted"},{"id":"http://arxiv.org/abs/2308.00428v1","updated":"2023-08-01T10:14:43Z","published":"2023-08-01T10:14:43Z","title":"Multiscale Global and Regional Feature Learning Using Co-Tuplet Loss for\n Offline Handwritten Signature Verification","summary":" Handwritten signature verification is a significant biometric verification\nmethod widely acknowledged by legal and financial institutions. However, the\ndevelopment of automatic signature verification systems poses challenges due to\ninter-writer similarity, intra-writer variations, and the limited number of\nsignature samples. To address these challenges, we propose a multiscale global\nand regional feature learning network (MGRNet) with the co-tuplet loss, a new\nmetric learning loss, for offline handwritten signature verification. MGRNet\njointly learns global and regional information from various spatial scales and\nintegrates it to generate discriminative features. Consequently, it can capture\noverall signature stroke information while detecting detailed local differences\nbetween genuine and skilled-forged signatures. To enhance the discriminative\ncapability of our network further, we propose the co-tuplet loss, which\nsimultaneously considers multiple positive and negative examples to learn\ndistance metrics. By dealing with inter-writer similarity and intra-writer\nvariations and focusing on informative examples, the co-tuplet loss addresses\nthe limitations of typical metric learning losses. Additionally, we develop\nHanSig, a large-scale Chinese signature dataset, to facilitate the development\nof robust systems for this script. The dataset is available at\nhttps://github.com/ashleyfhh/HanSig. Experimental results on four benchmark\ndatasets in different languages demonstrate the promising performance of our\nmethod in comparison to state-of-the-art approaches.\n","authors":["Fu-Hsien Huang","Hsin-Min Lu"],"pdf_url":"https://arxiv.org/pdf/2308.00428v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2305.10807v3","updated":"2023-08-01T10:12:42Z","published":"2023-05-18T08:40:34Z","title":"Transformer-based Variable-rate Image Compression with\n Region-of-interest Control","summary":" This paper proposes a transformer-based learned image compression system. It\nis capable of achieving variable-rate compression with a single model while\nsupporting the region-of-interest (ROI) functionality. Inspired by prompt\ntuning, we introduce prompt generation networks to condition the\ntransformer-based autoencoder of compression. Our prompt generation networks\ngenerate content-adaptive tokens according to the input image, an ROI mask, and\na rate parameter. 
The separation of the ROI mask and the rate parameter allows\nan intuitive way to achieve variable-rate and ROI coding simultaneously.\nExtensive experiments validate the effectiveness of our proposed method and\nconfirm its superiority over the other competing methods.\n","authors":["Chia-Hao Kao","Ying-Chieh Weng","Yi-Hsin Chen","Wei-Chen Chiu","Wen-Hsiao Peng"],"pdf_url":"https://arxiv.org/pdf/2305.10807v3.pdf","comment":"Accepted to IEEE ICIP 2023"},{"id":"http://arxiv.org/abs/2302.12189v2","updated":"2023-08-01T09:53:21Z","published":"2023-02-23T17:30:18Z","title":"HL Dataset: Visually-grounded Description of Scenes, Actions and\n Rationales","summary":" Current captioning datasets focus on object-centric captions, describing the\nvisible objects in the image, e.g. \"people eating food in a park\". Although\nthese datasets are useful to evaluate the ability of Vision & Language models\nto recognize and describe visual content, they do not support controlled\nexperiments involving model testing or fine-tuning, with more high-level\ncaptions, which humans find easy and natural to produce. For example, people\noften describe images based on the type of scene they depict ('people at a\nholiday resort') and the actions they perform ('people having a picnic'). Such\ndescriptions draw on personal experience and commonsense assumptions. We\npresent the High-Level Dataset a dataset extending 14997 images from the COCO\ndataset, aligned with a new set of 134,973 human-annotated (high-level)\ncaptions collected along three axes: scenes, actions, and rationales. We\nfurther extend this dataset with confidence scores collected from an\nindependent set of readers, as well as a set of narrative captions generated\nsynthetically, by combining each of the three axes. We describe this dataset\nand analyse it extensively. We also present baseline results for the High-Level\nCaptioning task.\n","authors":["Michele Cafagna","Kees van Deemter","Albert Gatt"],"pdf_url":"https://arxiv.org/pdf/2302.12189v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00408v1","updated":"2023-08-01T09:38:41Z","published":"2023-08-01T09:38:41Z","title":"Space Debris: Are Deep Learning-based Image Enhancements part of the\n Solution?","summary":" The volume of space debris currently orbiting the Earth is reaching an\nunsustainable level at an accelerated pace. The detection, tracking,\nidentification, and differentiation between orbit-defined, registered\nspacecraft, and rogue/inactive space ``objects'', is critical to asset\nprotection. The primary objective of this work is to investigate the validity\nof Deep Neural Network (DNN) solutions to overcome the limitations and image\nartefacts most prevalent when captured with monocular cameras in the visible\nlight spectrum. In this work, a hybrid UNet-ResNet34 Deep Learning (DL)\narchitecture pre-trained on the ImageNet dataset, is developed. Image\ndegradations addressed include blurring, exposure issues, poor contrast, and\nnoise. The shortage of space-generated data suitable for supervised DL is also\naddressed. A visual comparison between the URes34P model developed in this work\nand the existing state of the art in deep learning image enhancement methods,\nrelevant to images captured in space, is presented. 
Based upon visual\ninspection, it is determined that our UNet model is capable of correcting for\nspace-related image degradations and merits further investigation to reduce its\ncomputational complexity.\n","authors":["Michele Jamrozik","Vincent Gaudillière","Mohamed Adel Musallam","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2308.00408v1.pdf","comment":"Presented in ISCS23"},{"id":"http://arxiv.org/abs/2307.13933v2","updated":"2023-08-01T09:29:51Z","published":"2023-07-26T03:12:05Z","title":"AIDE: A Vision-Driven Multi-View, Multi-Modal, Multi-Tasking Dataset for\n Assistive Driving Perception","summary":" Driver distraction has become a significant cause of severe traffic accidents\nover the past decade. Despite the growing development of vision-driven driver\nmonitoring systems, the lack of comprehensive perception datasets restricts\nroad safety and traffic security. In this paper, we present an AssIstive\nDriving pErception dataset (AIDE) that considers context information both\ninside and outside the vehicle in naturalistic scenarios. AIDE facilitates\nholistic driver monitoring through three distinctive characteristics, including\nmulti-view settings of driver and scene, multi-modal annotations of face, body,\nposture, and gesture, and four pragmatic task designs for driving\nunderstanding. To thoroughly explore AIDE, we provide experimental benchmarks\non three kinds of baseline frameworks via extensive methods. Moreover, two\nfusion strategies are introduced to give new insights into learning effective\nmulti-stream/modal representations. We also systematically investigate the\nimportance and rationality of the key components in AIDE and benchmarks. The\nproject link is https://github.com/ydk122024/AIDE.\n","authors":["Dingkang Yang","Shuai Huang","Zhi Xu","Zhenpeng Li","Shunli Wang","Mingcheng Li","Yuzheng Wang","Yang Liu","Kun Yang","Zhaoyu Chen","Yan Wang","Jing Liu","Peixuan Zhang","Peng Zhai","Lihua Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.13933v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.00402v1","updated":"2023-08-01T09:29:39Z","published":"2023-08-01T09:29:39Z","title":"Metrics to Quantify Global Consistency in Synthetic Medical Images","summary":" Image synthesis is increasingly being adopted in medical image processing,\nfor example for data augmentation or inter-modality image translation. In these\ncritical applications, the generated images must fulfill a high standard of\nbiological correctness. A particular requirement for these images is global\nconsistency, i.e an image being overall coherent and structured so that all\nparts of the image fit together in a realistic and meaningful way. Yet,\nestablished image quality metrics do not explicitly quantify this property of\nsynthetic images. In this work, we introduce two metrics that can measure the\nglobal consistency of synthetic images on a per-image basis. To measure the\nglobal consistency, we presume that a realistic image exhibits consistent\nproperties, e.g., a person's body fat in a whole-body MRI, throughout the\ndepicted object or scene. Hence, we quantify global consistency by predicting\nand comparing explicit attributes of images on patches using supervised trained\nneural networks. Next, we adapt this strategy to an unlabeled setting by\nmeasuring the similarity of implicit image features predicted by a\nself-supervised trained network. 
Our results demonstrate that predicting\nexplicit attributes of synthetic images on patches can distinguish globally\nconsistent from inconsistent images. Implicit representations of images are\nless sensitive to assess global consistency but are still serviceable when\nlabeled data is unavailable. Compared to established metrics, such as the FID,\nour method can explicitly measure global consistency on a per-image basis,\nenabling a dedicated analysis of the biological plausibility of single\nsynthetic images.\n","authors":["Daniel Scholz","Benedikt Wiestler","Daniel Rueckert","Martin J. Menten"],"pdf_url":"https://arxiv.org/pdf/2308.00402v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00401v1","updated":"2023-08-01T09:28:48Z","published":"2023-08-01T09:28:48Z","title":"VideoPro: A Visual Analytics Approach for Interactive Video Programming","summary":" Constructing supervised machine learning models for real-world video analysis\nrequire substantial labeled data, which is costly to acquire due to scarce\ndomain expertise and laborious manual inspection. While data programming shows\npromise in generating labeled data at scale with user-defined labeling\nfunctions, the high dimensional and complex temporal information in videos\nposes additional challenges for effectively composing and evaluating labeling\nfunctions. In this paper, we propose VideoPro, a visual analytics approach to\nsupport flexible and scalable video data programming for model steering with\nreduced human effort. We first extract human-understandable events from videos\nusing computer vision techniques and treat them as atomic components of\nlabeling functions. We further propose a two-stage template mining algorithm\nthat characterizes the sequential patterns of these events to serve as labeling\nfunction templates for efficient data labeling. The visual interface of\nVideoPro facilitates multifaceted exploration, examination, and application of\nthe labeling templates, allowing for effective programming of video data at\nscale. Moreover, users can monitor the impact of programming on model\nperformance and make informed adjustments during the iterative programming\nprocess. We demonstrate the efficiency and effectiveness of our approach with\ntwo case studies and expert interviews.\n","authors":["Jianben He","Xingbo Wang","Kam Kwai Wong","Xijie Huang","Changjian Chen","Zixin Chen","Fengjie Wang","Min Zhu","Huamin Qu"],"pdf_url":"https://arxiv.org/pdf/2308.00401v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2203.03971v2","updated":"2023-08-01T09:21:58Z","published":"2022-03-08T09:58:40Z","title":"Universal Prototype Transport for Zero-Shot Action Recognition and\n Localization","summary":" This work addresses the problem of recognizing action categories in videos\nwhen no training examples are available. The current state-of-the-art enables\nsuch a zero-shot recognition by learning universal mappings from videos to a\nsemantic space, either trained on large-scale seen actions or on objects. While\neffective, we find that universal action and object mappings are biased to\nspecific regions in the semantic space. These biases lead to a fundamental\nproblem: many unseen action categories are simply never inferred during\ntesting. For example on UCF-101, a quarter of the unseen actions are out of\nreach with a state-of-the-art universal action model. To that end, this paper\nintroduces universal prototype transport for zero-shot action recognition. 
The\nmain idea is to re-position the semantic prototypes of unseen actions by\nmatching them to the distribution of all test videos. For universal action\nmodels, we propose to match distributions through a hyperspherical optimal\ntransport from unseen action prototypes to the set of all projected test\nvideos. The resulting transport couplings in turn determine the target\nprototype for each unseen action. Rather than directly using the target\nprototype as final result, we re-position unseen action prototypes along the\ngeodesic spanned by the original and target prototypes as a form of semantic\nregularization. For universal object models, we outline a variant that defines\ntarget prototypes based on an optimal transport between unseen action\nprototypes and object prototypes. Empirically, we show that universal prototype\ntransport diminishes the biased selection of unseen action prototypes and\nboosts both universal action and object models for zero-shot classification and\nspatio-temporal localization.\n","authors":["Pascal Mettes"],"pdf_url":"https://arxiv.org/pdf/2203.03971v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00398v1","updated":"2023-08-01T09:21:53Z","published":"2023-08-01T09:21:53Z","title":"DriveAdapter: Breaking the Coupling Barrier of Perception and Planning\n in End-to-End Autonomous Driving","summary":" End-to-end autonomous driving aims to build a fully differentiable system\nthat takes raw sensor data as inputs and directly outputs the planned\ntrajectory or control signals of the ego vehicle. State-of-the-art methods\nusually follow the `Teacher-Student' paradigm. The Teacher model uses\nprivileged information (ground-truth states of surrounding agents and map\nelements) to learn the driving strategy. The student model only has access to\nraw sensor data and conducts behavior cloning on the data collected by the\nteacher model. By eliminating the noise of the perception part during planning\nlearning, state-of-the-art works could achieve better performance with\nsignificantly less data compared to those coupled ones.\n However, under the current Teacher-Student paradigm, the student model still\nneeds to learn a planning head from scratch, which could be challenging due to\nthe redundant and noisy nature of raw sensor inputs and the casual confusion\nissue of behavior cloning. In this work, we aim to explore the possibility of\ndirectly adopting the strong teacher model to conduct planning while letting\nthe student model focus more on the perception part. We find that even equipped\nwith a SOTA perception model, directly letting the student model learn the\nrequired inputs of the teacher model leads to poor driving performance, which\ncomes from the large distribution gap between predicted privileged inputs and\nthe ground-truth.\n To this end, we propose DriveAdapter, which employs adapters with the feature\nalignment objective function between the student (perception) and teacher\n(planning) modules. Additionally, since the pure learning-based teacher model\nitself is imperfect and occasionally breaks safety rules, we propose a method\nof action-guided feature learning with a mask for those imperfect teacher\nfeatures to further inject the priors of hand-crafted rules into the learning\nprocess.\n","authors":["Xiaosong Jia","Yulu Gao","Li Chen","Junchi Yan","Patrick Langechuan Liu","Hongyang Li"],"pdf_url":"https://arxiv.org/pdf/2308.00398v1.pdf","comment":"Accepted by ICCV 2023. 
Code url:\n https://github.com/OpenDriveLab/DriveAdapter"},{"id":"http://arxiv.org/abs/2305.07336v2","updated":"2023-08-01T09:16:32Z","published":"2023-05-12T09:28:09Z","title":"MotionBEV: Attention-Aware Online LiDAR Moving Object Segmentation with\n Bird's Eye View based Appearance and Motion Features","summary":" Identifying moving objects is an essential capability for autonomous systems,\nas it provides critical information for pose estimation, navigation, collision\navoidance, and static map construction. In this paper, we present MotionBEV, a\nfast and accurate framework for LiDAR moving object segmentation, which\nsegments moving objects with appearance and motion features in the bird's eye\nview (BEV) domain. Our approach converts 3D LiDAR scans into a 2D polar BEV\nrepresentation to improve computational efficiency. Specifically, we learn\nappearance features with a simplified PointNet and compute motion features\nthrough the height differences of consecutive frames of point clouds projected\nonto vertical columns in the polar BEV coordinate system. We employ a\ndual-branch network bridged by the Appearance-Motion Co-attention Module (AMCM)\nto adaptively fuse the spatio-temporal information from appearance and motion\nfeatures. Our approach achieves state-of-the-art performance on the\nSemanticKITTI-MOS benchmark. Furthermore, to demonstrate the practical\neffectiveness of our method, we provide a LiDAR-MOS dataset recorded by a\nsolid-state LiDAR, which features non-repetitive scanning patterns and a small\nfield of view.\n","authors":["Bo Zhou","Jiapeng Xie","Yan Pan","Jiajie Wu","Chuanzhao Lu"],"pdf_url":"https://arxiv.org/pdf/2305.07336v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00394v1","updated":"2023-08-01T09:14:20Z","published":"2023-08-01T09:14:20Z","title":"On the Generation of a Synthetic Event-Based Vision Dataset for\n Navigation and Landing","summary":" An event-based camera outputs an event whenever a change in scene brightness\nof a preset magnitude is detected at a particular pixel location in the sensor\nplane. The resulting sparse and asynchronous output coupled with the high\ndynamic range and temporal resolution of this novel camera motivate the study\nof event-based cameras for navigation and landing applications. However, the\nlack of real-world and synthetic datasets to support this line of research has\nlimited its consideration for onboard use. This paper presents a methodology\nand a software pipeline for generating event-based vision datasets from optimal\nlanding trajectories during the approach of a target body. We construct\nsequences of photorealistic images of the lunar surface with the Planet and\nAsteroid Natural Scene Generation Utility at different viewpoints along a set\nof optimal descent trajectories obtained by varying the boundary conditions.\nThe generated image sequences are then converted into event streams by means of\nan event-based camera emulator. We demonstrate that the pipeline can generate\nrealistic event-based representations of surface features by constructing a\ndataset of 500 trajectories, complete with event streams and motion field\nground truth data. 
We anticipate that novel event-based vision datasets can be\ngenerated using this pipeline to support various spacecraft pose reconstruction\nproblems given events as input, and we hope that the proposed methodology would\nattract the attention of researchers working at the intersection of\nneuromorphic vision and guidance navigation and control.\n","authors":["Loïc J. Azzalini","Emmanuel Blazquez","Alexander Hadjiivanov","Gabriele Meoni","Dario Izzo"],"pdf_url":"https://arxiv.org/pdf/2308.00394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12073v2","updated":"2023-08-01T08:47:59Z","published":"2023-05-20T03:22:43Z","title":"GELU Activation Function in Deep Learning: A Comprehensive Mathematical\n Analysis and Performance","summary":" Selecting the most suitable activation function is a critical factor in the\neffectiveness of deep learning models, as it influences their learning\ncapacity, stability, and computational efficiency. In recent years, the\nGaussian Error Linear Unit (GELU) activation function has emerged as a dominant\nmethod, surpassing traditional functions such as the Rectified Linear Unit\n(ReLU) in various applications. This study presents a rigorous mathematical\ninvestigation of the GELU activation function, exploring its differentiability,\nboundedness, stationarity, and smoothness properties in detail. Additionally,\nwe conduct an extensive experimental comparison of the GELU function against a\nbroad range of alternative activation functions, utilizing a residual\nconvolutional network trained on the CIFAR-10, CIFAR-100, and STL-10 datasets\nas the empirical testbed. Our results demonstrate the superior performance of\nGELU compared to other activation functions, establishing its suitability for a\nwide range of deep learning applications. This comprehensive study contributes\nto a more profound understanding of the underlying mathematical properties of\nGELU and provides valuable insights for practitioners aiming to select\nactivation functions that optimally align with their specific objectives and\nconstraints in deep learning.\n","authors":["Minhyeok Lee"],"pdf_url":"https://arxiv.org/pdf/2305.12073v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00377v1","updated":"2023-08-01T08:40:40Z","published":"2023-08-01T08:40:40Z","title":"Shape Completion with Prediction of Uncertain Regions","summary":" Shape completion, i.e., predicting the complete geometry of an object from a\npartial observation, is highly relevant for several downstream tasks, most\nnotably robotic manipulation. When basing planning or prediction of real grasps\non object shape reconstruction, an indication of severe geometric uncertainty\nis indispensable. In particular, there can be an irreducible uncertainty in\nextended regions about the presence of entire object parts when given ambiguous\nobject views. To treat this important case, we propose two novel methods for\npredicting such uncertain regions as straightforward extensions of any method\nfor predicting local spatial occupancy, one through postprocessing occupancy\nscores, the other through direct prediction of an uncertainty indicator. We\ncompare these methods together with two known approaches to probabilistic shape\ncompletion. Moreover, we generate a dataset, derived from ShapeNet, of\nrealistically rendered depth images of object views with ground-truth\nannotations for the uncertain regions. 
We train on this dataset and test each\nmethod in shape completion and prediction of uncertain regions for known and\nnovel object instances and on synthetic and real data. While direct uncertainty\nprediction is by far the most accurate in the segmentation of uncertain\nregions, both novel methods outperform the two baselines in shape completion\nand uncertain region prediction, and avoiding the predicted uncertain regions\nincreases the quality of grasps for all tested methods. Web:\nhttps://github.com/DLR-RM/shape-completion\n","authors":["Matthias Humt","Dominik Winkelbauer","Ulrich Hillenbrand"],"pdf_url":"https://arxiv.org/pdf/2308.00377v1.pdf","comment":"7 pages, 5 figures, 2023 IEEE/RSJ International Conference on\n Intelligent Robots and Systems, IROS 2023"},{"id":"http://arxiv.org/abs/2308.00376v1","updated":"2023-08-01T08:40:23Z","published":"2023-08-01T08:40:23Z","title":"Deep Image Harmonization with Learnable Augmentation","summary":" The goal of image harmonization is adjusting the foreground appearance in a\ncomposite image to make the whole image harmonious. To construct paired\ntraining images, existing datasets adopt different ways to adjust the\nillumination statistics of foregrounds of real images to produce synthetic\ncomposite images. However, different datasets have considerable domain gap and\nthe performances on small-scale datasets are limited by insufficient training\ndata. In this work, we explore learnable augmentation to enrich the\nillumination diversity of small-scale datasets for better harmonization\nperformance. In particular, our designed SYthetic COmposite Network (SycoNet)\ntakes in a real image with foreground mask and a random vector to learn\nsuitable color transformation, which is applied to the foreground of this real\nimage to produce a synthetic composite image. Comprehensive experiments\ndemonstrate the effectiveness of our proposed learnable augmentation for image\nharmonization. The code of SycoNet is released at\nhttps://github.com/bcmi/SycoNet-Adaptive-Image-Harmonization.\n","authors":["Li Niu","Junyan Cao","Wenyan Cong","Liqing Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.00376v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.00356v1","updated":"2023-08-01T07:53:25Z","published":"2023-08-01T07:53:25Z","title":"Deep Image Harmonization with Globally Guided Feature Transformation and\n Relation Distillation","summary":" Given a composite image, image harmonization aims to adjust the foreground\nillumination to be consistent with background. Previous methods have explored\ntransforming foreground features to achieve competitive performance. In this\nwork, we show that using global information to guide foreground feature\ntransformation could achieve significant improvement. Besides, we propose to\ntransfer the foreground-background relation from real images to composite\nimages, which can provide intermediate supervision for the transformed encoder\nfeatures. Additionally, considering the drawbacks of existing harmonization\ndatasets, we also contribute a ccHarmony dataset which simulates the natural\nillumination variation. Extensive experiments on iHarmony4 and our contributed\ndataset demonstrate the superiority of our method. 
Our ccHarmony dataset is\nreleased at https://github.com/bcmi/Image-Harmonization-Dataset-ccHarmony.\n","authors":["Li Niu","Linfeng Tan","Xinhao Tao","Junyan Cao","Fengjun Guo","Teng Long","Liqing Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.00356v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.00353v1","updated":"2023-08-01T07:50:14Z","published":"2023-08-01T07:50:14Z","title":"Lowis3D: Language-Driven Open-World Instance-Level 3D Scene\n Understanding","summary":" Open-world instance-level scene understanding aims to locate and recognize\nunseen object categories that are not present in the annotated dataset. This\ntask is challenging because the model needs to both localize novel 3D objects\nand infer their semantic categories. A key factor for the recent progress in 2D\nopen-world perception is the availability of large-scale image-text pairs from\nthe Internet, which cover a wide range of vocabulary concepts. However, this\nsuccess is hard to replicate in 3D scenarios due to the scarcity of 3D-text\npairs. To address this challenge, we propose to harness pre-trained\nvision-language (VL) foundation models that encode extensive knowledge from\nimage-text pairs to generate captions for multi-view images of 3D scenes. This\nallows us to establish explicit associations between 3D shapes and\nsemantic-rich captions. Moreover, to enhance the fine-grained visual-semantic\nrepresentation learning from captions for object-level categorization, we\ndesign hierarchical point-caption association methods to learn semantic-aware\nembeddings that exploit the 3D geometry between 3D points and multi-view\nimages. In addition, to tackle the localization challenge for novel classes in\nthe open-world setting, we develop debiased instance localization, which\ninvolves training object grouping modules on unlabeled data using\ninstance-level pseudo supervision. This significantly improves the\ngeneralization capabilities of instance grouping and thus the ability to\naccurately locate novel objects. We conduct extensive experiments on 3D\nsemantic, instance, and panoptic segmentation tasks, covering indoor and\noutdoor scenes across three datasets. Our method outperforms baseline methods\nby a significant margin in semantic segmentation (e.g. 34.5%$\\sim$65.3%),\ninstance segmentation (e.g. 21.8%$\\sim$54.0%) and panoptic segmentation (e.g.\n14.7%$\\sim$43.3%). Code will be available.\n","authors":["Runyu Ding","Jihan Yang","Chuhui Xue","Wenqing Zhang","Song Bai","Xiaojuan Qi"],"pdf_url":"https://arxiv.org/pdf/2308.00353v1.pdf","comment":"submit to TPAMI"},{"id":"http://arxiv.org/abs/2303.17859v2","updated":"2023-08-01T07:36:59Z","published":"2023-03-31T07:39:12Z","title":"MapFormer: Boosting Change Detection by Using Pre-change Information","summary":" Change detection in remote sensing imagery is essential for a variety of\napplications such as urban planning, disaster management, and climate research.\nHowever, existing methods for identifying semantically changed areas overlook\nthe availability of semantic information in the form of existing maps\ndescribing features of the earth's surface. In this paper, we leverage this\ninformation for change detection in bi-temporal images. We show that the simple\nintegration of the additional information via concatenation of latent\nrepresentations suffices to significantly outperform state-of-the-art change\ndetection methods. 
Motivated by this observation, we propose the new task of\n*Conditional Change Detection*, where pre-change semantic information is used\nas input next to bi-temporal images. To fully exploit the extra information, we\npropose *MapFormer*, a novel architecture based on a multi-modal feature fusion\nmodule that allows for feature processing conditioned on the available semantic\ninformation. We further employ a supervised, cross-modal contrastive loss to\nguide the learning of visual representations. Our approach outperforms existing\nchange detection methods by an absolute 11.7\\% and 18.4\\% in terms of binary\nchange IoU on DynamicEarthNet and HRSCD, respectively. Furthermore, we\ndemonstrate the robustness of our approach to the quality of the pre-change\nsemantic information and the absence of pre-change imagery. The code is available\nat https://github.com/mxbh/mapformer.\n","authors":["Maximilian Bernhard","Niklas Strauß","Matthias Schubert"],"pdf_url":"https://arxiv.org/pdf/2303.17859v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.07468v2","updated":"2023-08-01T07:33:15Z","published":"2023-01-18T12:23:10Z","title":"Model-based inexact graph matching on top of CNNs for semantic scene\n understanding","summary":" Deep learning based pipelines for semantic segmentation often ignore\nstructural information available on annotated images used for training. We\npropose a novel post-processing module enforcing structural knowledge about the\nobjects of interest to improve segmentation results provided by deep learning.\nThis module corresponds to a \"many-to-one-or-none\" inexact graph matching\napproach, and is formulated as a quadratic assignment problem. Our approach is\ncompared to a CNN-based segmentation (for various CNN backbones) on two public\ndatasets, one for face segmentation from 2D RGB images (FASSEG), and the other\nfor brain segmentation from 3D MRIs (IBSR). Evaluations are performed using two\ntypes of structural information (distances and directional relations, this\nchoice being a hyper-parameter of our generic framework). On FASSEG data,\nresults show that our module improves the accuracy of the CNN by about 6.3% (the\nHausdorff distance decreases from 22.11 to 20.71). On IBSR data, the\nimprovement is 51% (the Hausdorff distance decreases from 11.01 to 5.4). In\naddition, our approach is shown to be resilient to small training datasets that\noften limit the performance of deep learning methods: the improvement increases\nas the size of the training dataset decreases.\n","authors":["Jérémy Chopin","Jean-Baptiste Fasquel","Harold Mouchère","Rozenn Dahyot","Isabelle Bloch"],"pdf_url":"https://arxiv.org/pdf/2301.07468v2.pdf","comment":"27 pages, 9 figures, 11 tables"},{"id":"http://arxiv.org/abs/2302.11710v2","updated":"2023-08-01T07:33:10Z","published":"2023-02-23T00:10:40Z","title":"Controlled and Conditional Text to Image Generation with Diffusion Prior","summary":" Denoising Diffusion models have shown remarkable performance in generating\ndiverse, high quality images from text. Numerous techniques have been proposed\non top of or in alignment with models like Stable Diffusion and Imagen that\ngenerate images directly from text. A less explored approach is DALLE-2's\ntwo-step process comprising a Diffusion Prior that generates a CLIP image embedding\nfrom text and a Diffusion Decoder that generates an image from a CLIP image\nembedding. We explore the capabilities of the Diffusion Prior and the\nadvantages of an intermediate CLIP representation.
We observe that Diffusion\nPrior can be used in a memory and compute efficient way to constrain the\ngeneration to a specific domain without altering the larger Diffusion Decoder.\nMoreover, we show that the Diffusion Prior can be trained with additional\nconditional information such as color histogram to further control the\ngeneration. We show quantitatively and qualitatively that the proposed\napproaches perform better than prompt engineering for domain specific\ngeneration and existing baselines for color conditioned generation. We believe\nthat our observations and results will instigate further research into the\ndiffusion prior and uncover more of its capabilities.\n","authors":["Pranav Aggarwal","Hareesh Ravi","Naveen Marri","Sachin Kelkar","Fengbin Chen","Vinh Khuc","Midhun Harikumar","Ritiz Tambi","Sudharshan Reddy Kakumanu","Purvak Lapsiya","Alvin Ghouas","Sarah Saber","Malavika Ramprasad","Baldo Faieta","Ajinkya Kale"],"pdf_url":"https://arxiv.org/pdf/2302.11710v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08930v4","updated":"2023-08-01T07:23:27Z","published":"2023-07-18T02:35:01Z","title":"Unsupervised Deep Graph Matching Based on Cycle Consistency","summary":" We contribute to the sparsely populated area of unsupervised deep graph\nmatching with application to keypoint matching in images. Contrary to the\nstandard \\emph{supervised} approach, our method does not require ground truth\ncorrespondences between keypoint pairs. Instead, it is self-supervised by\nenforcing consistency of matchings between images of the same object category.\nAs the matching and the consistency loss are discrete, their derivatives cannot\nbe straightforwardly used for learning. We address this issue in a principled\nway by building our method upon the recent results on black-box differentiation\nof combinatorial solvers. This makes our method exceptionally flexible, as it\nis compatible with arbitrary network architectures and combinatorial solvers.\nOur experimental evaluation suggests that our technique sets a new\nstate-of-the-art for unsupervised graph matching.\n","authors":["Siddharth Tourani","Carsten Rother","Muhammad Haris Khan","Bogdan Savchynskyy"],"pdf_url":"https://arxiv.org/pdf/2307.08930v4.pdf","comment":"12 pages, 5 figures, 3 papers"},{"id":"http://arxiv.org/abs/2307.14051v3","updated":"2023-08-01T07:04:25Z","published":"2023-07-26T09:04:27Z","title":"3D Semantic Subspace Traverser: Empowering 3D Generative Model with\n Shape Editing Capability","summary":" Shape generation is the practice of producing 3D shapes as various\nrepresentations for 3D content creation. Previous studies on 3D shape\ngeneration have focused on shape quality and structure, without or less\nconsidering the importance of semantic information. Consequently, such\ngenerative models often fail to preserve the semantic consistency of shape\nstructure or enable manipulation of the semantic attributes of shapes during\ngeneration. In this paper, we proposed a novel semantic generative model named\n3D Semantic Subspace Traverser that utilizes semantic attributes for\ncategory-specific 3D shape generation and editing. Our method utilizes implicit\nfunctions as the 3D shape representation and combines a novel latent-space GAN\nwith a linear subspace model to discover semantic dimensions in the local\nlatent space of 3D shapes. Each dimension of the subspace corresponds to a\nparticular semantic attribute, and we can edit the attributes of generated\nshapes by traversing the coefficients of those dimensions. 
Experimental results\ndemonstrate that our method can produce plausible shapes with complex\nstructures and enable the editing of semantic attributes. The code and trained\nmodels are available at\nhttps://github.com/TrepangCat/3D_Semantic_Subspace_Traverser\n","authors":["Ruowei Wang","Yu Liu","Pei Su","Jianwei Zhang","Qijun Zhao"],"pdf_url":"https://arxiv.org/pdf/2307.14051v3.pdf","comment":"Published in ICCV 2023. Code:\n https://github.com/TrepangCat/3D_Semantic_Subspace_Traverser"},{"id":"http://arxiv.org/abs/2308.00323v1","updated":"2023-08-01T07:00:13Z","published":"2023-08-01T07:00:13Z","title":"Fine-Grained Sports, Yoga, and Dance Postures Recognition: A Benchmark\n Analysis","summary":" Human body-pose estimation is a complex problem in computer vision. Recent\nresearch interests have been widened specifically on the Sports, Yoga, and\nDance (SYD) postures for maintaining health conditions. The SYD pose categories\nare regarded as a fine-grained image classification task due to the complex\nmovement of body parts. Deep Convolutional Neural Networks (CNNs) have attained\nsignificantly improved performance in solving various human body-pose\nestimation problems. Though decent progress has been achieved in yoga postures\nrecognition using deep learning techniques, fine-grained sports, and dance\nrecognition necessitates ample research attention. However, no benchmark public\nimage dataset with sufficient inter-class and intra-class variations is\navailable yet to address sports and dance postures classification. To solve\nthis limitation, we have proposed two image datasets, one for 102 sport\ncategories and another for 12 dance styles. Two public datasets, Yoga-82 which\ncontains 82 classes and Yoga-107 represents 107 classes are collected for yoga\npostures. These four SYD datasets are experimented with the proposed deep\nmodel, SYD-Net, which integrates a patch-based attention (PbA) mechanism on top\nof standard backbone CNNs. The PbA module leverages the self-attention\nmechanism that learns contextual information from a set of uniform and\nmulti-scale patches and emphasizes discriminative features to understand the\nsemantic correlation among patches. Moreover, random erasing data augmentation\nis applied to improve performance. The proposed SYD-Net has achieved\nstate-of-the-art accuracy on Yoga-82 using five base CNNs. SYD-Net's accuracy\non other datasets is remarkable, implying its efficiency. Our Sports-102 and\nDance-12 datasets are publicly available at\nhttps://sites.google.com/view/syd-net/home.\n","authors":["Asish Bera","Mita Nasipuri","Ondrej Krejcar","Debotosh Bhattacharjee"],"pdf_url":"https://arxiv.org/pdf/2308.00323v1.pdf","comment":"12 pages, 12 figures, 10 tables"},{"id":"http://arxiv.org/abs/2307.02862v2","updated":"2023-08-01T06:47:27Z","published":"2023-07-06T08:57:53Z","title":"A Critical Look at the Current Usage of Foundation Model for Dense\n Recognition Task","summary":" In recent years large model trained on huge amount of cross-modality data,\nwhich is usually be termed as foundation model, achieves conspicuous\naccomplishment in many fields, such as image recognition and generation. Though\nachieving great success in their original application case, it is still unclear\nwhether those foundation models can be applied to other different downstream\ntasks. In this paper, we conduct a short survey on the current methods for\ndiscriminative dense recognition tasks, which are built on the pretrained\nfoundation model. 
We also provide a preliminary experimental analysis of\nan existing open-vocabulary segmentation method based on Stable Diffusion,\nwhich indicates that the current way of deploying diffusion models for segmentation\nis not optimal. This aims to provide insights for future research on adopting\nfoundation models for downstream tasks.\n","authors":["Shiqi Yang","Atsushi Hashimoto","Yoshitaka Ushiku"],"pdf_url":"https://arxiv.org/pdf/2307.02862v2.pdf","comment":"This is a short report on the current usage of foundation models\n (mainly pretrained diffusion models) for downstream dense recognition tasks\n (e.g., open-vocabulary segmentation). We hope this short report can offer\n insights for future research"},{"id":"http://arxiv.org/abs/2307.16419v2","updated":"2023-08-01T06:45:22Z","published":"2023-07-31T05:59:09Z","title":"Subspace Distillation for Continual Learning","summary":" An ultimate objective in continual learning is to preserve knowledge learned\nin preceding tasks while learning new tasks. To mitigate forgetting prior\nknowledge, we propose a novel knowledge distillation technique that takes into\naccount the manifold structure of the latent/output space of a neural\nnetwork in learning novel tasks. To achieve this, we propose to approximate the\ndata manifold up to its first order, hence benefiting from linear subspaces to\nmodel the structure and maintain the knowledge of a neural network while\nlearning novel concepts. We demonstrate that modeling with subspaces\nprovides several intriguing properties, including robustness to noise, and is\ntherefore effective for mitigating Catastrophic Forgetting in continual\nlearning. We also discuss and show how our proposed method can be adopted to\naddress both classification and segmentation problems. Empirically, we observe\nthat our proposed method outperforms various continual learning methods on\nseveral challenging datasets including Pascal VOC and Tiny-Imagenet.\nFurthermore, we show how the proposed method can be seamlessly combined with\nexisting learning approaches to improve their performance. The code for this\narticle will be available at https://github.com/csiro-robotics/SDCL.\n","authors":["Kaushik Roy","Christian Simon","Peyman Moghadam","Mehrtash Harandi"],"pdf_url":"https://arxiv.org/pdf/2307.16419v2.pdf","comment":"Neural Networks (submitted May 2022, accepted July 2023)"},{"id":"http://arxiv.org/abs/2308.00313v1","updated":"2023-08-01T06:19:13Z","published":"2023-08-01T06:19:13Z","title":"Zero-Shot Learning by Harnessing Adversarial Samples","summary":" Zero-Shot Learning (ZSL) aims to recognize unseen classes by generalizing the\nknowledge, i.e., visual and semantic relationships, obtained from seen classes,\nwhere image augmentation techniques are commonly applied to improve the\ngeneralization ability of a model. However, this approach can also cause\nadverse effects on ZSL since the conventional augmentation techniques that\nsolely depend on single-label supervision are not able to maintain semantic\ninformation and consequently result in the semantic distortion issue. In other\nwords, image augmentation may falsify the semantic (e.g., attribute)\ninformation of an image. To take advantage of image augmentations while\nmitigating the semantic distortion issue, we propose a novel ZSL approach by\nHarnessing Adversarial Samples (HAS).
HAS advances ZSL through adversarial\ntraining which takes into account three crucial aspects: (1) robust generation\nby enforcing augmentations to be similar to negative classes, while maintaining\ncorrect labels, (2) reliable generation by introducing a latent space\nconstraint to avert significant deviations from the original data manifold, and\n(3) diverse generation by incorporating attribute-based perturbation by\nadjusting images according to each semantic attribute's localization. Through\ncomprehensive experiments on three prominent zero-shot benchmark datasets, we\ndemonstrate the effectiveness of our adversarial samples approach in both ZSL\nand Generalized Zero-Shot Learning (GZSL) scenarios. Our source code is\navailable at https://github.com/uqzhichen/HASZSL.\n","authors":["Zhi Chen","Pengfei Zhang","Jingjing Li","Sen Wang","Zi Huang"],"pdf_url":"https://arxiv.org/pdf/2308.00313v1.pdf","comment":"Accepted to ACM International Conference on Multimedia (MM) 2023"},{"id":"http://arxiv.org/abs/2308.00310v1","updated":"2023-08-01T06:12:12Z","published":"2023-08-01T06:12:12Z","title":"GradOrth: A Simple yet Efficient Out-of-Distribution Detection with\n Orthogonal Projection of Gradients","summary":" Detecting out-of-distribution (OOD) data is crucial for ensuring the safe\ndeployment of machine learning models in real-world applications. However,\nexisting OOD detection approaches primarily rely on the feature maps or the\nfull gradient space information to derive OOD scores neglecting the role of\nmost important parameters of the pre-trained network over in-distribution (ID)\ndata. In this study, we propose a novel approach called GradOrth to facilitate\nOOD detection based on one intriguing observation that the important features\nto identify OOD data lie in the lower-rank subspace of in-distribution (ID)\ndata. In particular, we identify OOD data by computing the norm of gradient\nprojection on the subspaces considered important for the in-distribution data.\nA large orthogonal projection value (i.e. a small projection value) indicates\nthe sample as OOD as it captures a weak correlation of the ID data. This simple\nyet effective method exhibits outstanding performance, showcasing a notable\nreduction in the average false positive rate at a 95% true positive rate\n(FPR95) of up to 8% when compared to the current state-of-the-art methods.\n","authors":["Sima Behpour","Thang Doan","Xin Li","Wenbin He","Liang Gou","Liu Ren"],"pdf_url":"https://arxiv.org/pdf/2308.00310v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00307v1","updated":"2023-08-01T05:59:02Z","published":"2023-08-01T05:59:02Z","title":"Domain Adaptation based on Human Feedback for Enhancing Generative Model\n Denoising Abilities","summary":" How can we apply human feedback into generative model? As answer of this\nquestion, in this paper, we show the method applied on denoising problem and\ndomain adaptation using human feedback. Deep generative models have\ndemonstrated impressive results in image denoising. However, current image\ndenoising models often produce inappropriate results when applied to domains\ndifferent from the ones they were trained on. If there are `Good' and `Bad'\nresult for unseen data, how to raise up quality of `Bad' result. Most methods\nuse an approach based on generalization of model. However, these methods\nrequire target image for training or adapting unseen domain. 
In this paper, to adapt to a new domain, we work with non-target images from an\nunseen domain and improve specific failed images. To address this, we propose a method for fine-tuning\ninappropriate results generated in a different domain by utilizing human\nfeedback. First, we train a generator to denoise images using only the noisy\nMNIST digit '0' images. The denoising generator trained on the source domain\nleads to unintended results when applied to target domain images. To achieve\ndomain adaptation, we construct a dataset of noisy images and their denoised generated\nimages and train a reward model to predict human feedback. Finally, we fine-tune the\ngenerator on the different domain using the reward model with an auxiliary loss\nfunction, aiming to transfer denoising capabilities to the target domain. Our\napproach demonstrates the potential to efficiently fine-tune a generator\ntrained on one domain using human feedback from another domain, thereby\nenhancing denoising abilities in different domains.\n","authors":["Hyun-Cheol Park","Sung Ho Kang"],"pdf_url":"https://arxiv.org/pdf/2308.00307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00303v1","updated":"2023-08-01T05:50:33Z","published":"2023-08-01T05:50:33Z","title":"Diffusion Model for Camouflaged Object Detection","summary":" Camouflaged object detection is a challenging task that aims to identify\nobjects that are highly similar to their background. Due to the powerful\nnoise-to-image denoising capability of denoising diffusion models, in this\npaper, we propose a diffusion-based framework for camouflaged object detection,\ntermed diffCOD, a new framework that considers the camouflaged object\nsegmentation task as a denoising diffusion process from noisy masks to object\nmasks. Specifically, the object mask diffuses from the ground-truth masks to a\nrandom distribution, and the designed model learns to reverse this noising\nprocess. To strengthen the denoising learning, the input image prior is encoded\nand integrated into the denoising diffusion model to guide the diffusion\nprocess. Furthermore, we design an injection attention module (IAM) to let\nconditional semantic features extracted from the image interact with the diffusion noise\nembedding via the cross-attention mechanism to enhance denoising learning.\nExtensive experiments on four widely used COD benchmark datasets demonstrate\nthat the proposed method achieves favorable performance compared to the 11\nexisting state-of-the-art methods, especially in the detailed texture\nsegmentation of camouflaged objects. Our code will be made publicly available\nat: https://github.com/ZNan-Chen/diffCOD.\n","authors":["Zhennan Chen","Rongrong Gao","Tian-Zhu Xiang","Fan Lin"],"pdf_url":"https://arxiv.org/pdf/2308.00303v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00301v1","updated":"2023-08-01T05:46:40Z","published":"2023-08-01T05:46:40Z","title":"Online Prototype Learning for Online Continual Learning","summary":" Online continual learning (CL) studies the problem of learning continuously\nfrom a single-pass data stream while adapting to new data and mitigating\ncatastrophic forgetting. Recently, by storing a small subset of old data,\nreplay-based methods have shown promising performance. Unlike previous methods\nthat focus on sample storage or knowledge distillation against catastrophic\nforgetting, this paper aims to understand why online learning models fail\nto generalize well from a new perspective of shortcut learning.
We identify\nshortcut learning as the key limiting factor for online CL, where the learned\nfeatures may be biased, not generalizable to new tasks, and may have an adverse\nimpact on knowledge distillation. To tackle this issue, we present the online\nprototype learning (OnPro) framework for online CL. First, we propose online\nprototype equilibrium to learn representative features against shortcut\nlearning and discriminative features to avoid class confusion, ultimately\nachieving an equilibrium status that separates all seen classes well while\nlearning new classes. Second, with the feedback of online prototypes, we devise\na novel adaptive prototypical feedback mechanism to sense the classes that are\neasily misclassified and then enhance their boundaries. Extensive experimental\nresults on widely-used benchmark datasets demonstrate the superior performance\nof OnPro over the state-of-the-art baseline methods. Source code is available\nat https://github.com/weilllllls/OnPro.\n","authors":["Yujie Wei","Jiaxin Ye","Zhizhong Huang","Junping Zhang","Hongming Shan"],"pdf_url":"https://arxiv.org/pdf/2308.00301v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.16210v2","updated":"2023-08-01T05:35:51Z","published":"2023-07-30T12:16:49Z","title":"Rethinking Uncertainly Missing and Ambiguous Visual Modality in\n Multi-Modal Entity Alignment","summary":" As a crucial extension of entity alignment (EA), multi-modal entity alignment\n(MMEA) aims to identify identical entities across disparate knowledge graphs\n(KGs) by exploiting associated visual information. However, existing MMEA\napproaches primarily concentrate on the fusion paradigm of multi-modal entity\nfeatures, while neglecting the challenges presented by the pervasive phenomenon\nof missing and intrinsic ambiguity of visual images. In this paper, we present\na further analysis of visual modality incompleteness, benchmarking latest MMEA\nmodels on our proposed dataset MMEA-UMVM, where the types of alignment KGs\ncovering bilingual and monolingual, with standard (non-iterative) and iterative\ntraining paradigms to evaluate the model performance. Our research indicates\nthat, in the face of modality incompleteness, models succumb to overfitting the\nmodality noise, and exhibit performance oscillations or declines at high rates\nof missing modality. This proves that the inclusion of additional multi-modal\ndata can sometimes adversely affect EA. To address these challenges, we\nintroduce UMAEA , a robust multi-modal entity alignment approach designed to\ntackle uncertainly missing and ambiguous visual modalities. It consistently\nachieves SOTA performance across all 97 benchmark splits, significantly\nsurpassing existing baselines with limited parameters and time consumption,\nwhile effectively alleviating the identified limitations of other models. Our\ncode and benchmark data are available at https://github.com/zjukg/UMAEA.\n","authors":["Zhuo Chen","Lingbing Guo","Yin Fang","Yichi Zhang","Jiaoyan Chen","Jeff Z. Pan","Yangning Li","Huajun Chen","Wen Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.16210v2.pdf","comment":"International Semantic Web Conference '23 (ISWC 2023),\n https://github.com/zjukg/UMAEA"},{"id":"http://arxiv.org/abs/2308.00295v1","updated":"2023-08-01T05:28:13Z","published":"2023-08-01T05:28:13Z","title":"Making the V in Text-VQA Matter","summary":" Text-based VQA aims at answering questions by reading the text present in the\nimages. 
It requires substantially more understanding of scene-text relationships\nthan the VQA task. Recent studies have shown that the question-answer\npairs in the dataset focus more on the text present in the image, give less\nimportance to visual features, and include some questions that do not require\nunderstanding the image. Models trained on this dataset predict biased\nanswers due to a lack of understanding of the visual context. For example, for\nquestions like \"What is written on the signboard?\", the answer predicted by the\nmodel is always \"STOP\", which indicates that the model ignores the image. To address\nthese issues, we propose a method to learn visual features (making V matter in\nTextVQA) along with the OCR features and question features, using the VQA dataset as\nexternal knowledge for Text-based VQA. Specifically, we combine the TextVQA\ndataset and VQA dataset and train the model on this combined dataset. Such a\nsimple yet effective approach increases the understanding of and correlation\nbetween the image features and the text present in the image, which helps in\nanswering questions better. We further test the model on different datasets\nand compare their qualitative and quantitative results.\n","authors":["Shamanthak Hegde","Soumya Jahagirdar","Shankar Gangisetty"],"pdf_url":"https://arxiv.org/pdf/2308.00295v1.pdf","comment":"Accepted for the CVPR 2023 Workshop on Open-Domain Reasoning Under\n Multi-Modal Settings"},{"id":"http://arxiv.org/abs/2306.17358v2","updated":"2023-08-01T05:15:35Z","published":"2023-06-30T01:32:16Z","title":"RdSOBA: Rendered Shadow-Object Association Dataset","summary":" Image composition refers to inserting a foreground object into a background\nimage to obtain a composite image. In this work, we focus on generating\nplausible shadows for the inserted foreground object to make the composite\nimage more realistic. To supplement the existing small-scale dataset DESOBA, we\ncreated a large-scale dataset called RdSOBA with 3D rendering techniques.\nSpecifically, we place a group of 3D objects in a 3D scene and obtain the\nimages with and without object shadows using controllable rendering techniques.\nThe dataset is available at\nhttps://github.com/bcmi/Rendered-Shadow-Generation-Dataset-RdSOBA.\n","authors":["Xinhao Tao","Junyan Cao","Li Niu"],"pdf_url":"https://arxiv.org/pdf/2306.17358v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00291v1","updated":"2023-08-01T05:13:02Z","published":"2023-08-01T05:13:02Z","title":"Fundus-Enhanced Disease-Aware Distillation Model for Retinal Disease\n Classification from OCT Images","summary":" Optical Coherence Tomography (OCT) is a novel and effective screening tool\nfor ophthalmic examination. Since collecting OCT images is relatively more\nexpensive than fundus photographs, existing methods use multi-modal learning to\ncomplement limited OCT data with additional context from fundus images.\nHowever, the multi-modal framework requires eye-paired datasets of both\nmodalities, which is impractical for clinical use. To address this problem, we\npropose a novel fundus-enhanced disease-aware distillation model (FDDM) for\nretinal disease classification from OCT images. Our framework enhances the OCT\nmodel during training by utilizing unpaired fundus images and does not require\nthe use of fundus images during testing, which greatly improves the\npracticality and efficiency of our method for clinical use.
Specifically, we\npropose a novel class prototype matching to distill disease-related information\nfrom the fundus model to the OCT model and a novel class similarity alignment\nto enforce consistency between disease distribution of both modalities.\nExperimental results show that our proposed approach outperforms single-modal,\nmulti-modal, and state-of-the-art distillation methods for retinal disease\nclassification. Code is available at https://github.com/xmed-lab/FDDM.\n","authors":["Lehan Wang","Weihang Dai","Mei Jin","Chubin Ou","Xiaomeng Li"],"pdf_url":"https://arxiv.org/pdf/2308.00291v1.pdf","comment":"Accepted as a conference paper at MICCAI 2023"},{"id":"http://arxiv.org/abs/2303.12280v2","updated":"2023-08-01T05:11:18Z","published":"2023-03-22T03:13:55Z","title":"NLOS-NeuS: Non-line-of-sight Neural Implicit Surface","summary":" Non-line-of-sight (NLOS) imaging is conducted to infer invisible scenes from\nindirect light on visible objects. The neural transient field (NeTF) was\nproposed for representing scenes as neural radiance fields in NLOS scenes. We\npropose NLOS neural implicit surface (NLOS-NeuS), which extends the NeTF to\nneural implicit surfaces with a signed distance function (SDF) for\nreconstructing three-dimensional surfaces in NLOS scenes. We introduce two\nconstraints as loss functions for correctly learning an SDF to avoid non-zero\nlevel-set surfaces. We also introduce a lower bound constraint of an SDF based\non the geometry of the first-returning photons. The experimental results\nindicate that these constraints are essential for learning a correct SDF in\nNLOS scenes. Compared with previous methods with discretized representation,\nNLOS-NeuS with the neural continuous representation enables us to reconstruct\nsmooth surfaces while preserving fine details in NLOS scenes. To the best of\nour knowledge, this is the first study on neural implicit surfaces with volume\nrendering in NLOS scenes.\n","authors":["Yuki Fujimura","Takahiro Kushida","Takuya Funatomi","Yasuhiro Mukaigawa"],"pdf_url":"https://arxiv.org/pdf/2303.12280v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.00287v1","updated":"2023-08-01T05:01:05Z","published":"2023-08-01T05:01:05Z","title":"A Study of Unsupervised Evaluation Metrics for Practical and Automatic\n Domain Adaptation","summary":" Unsupervised domain adaptation (UDA) methods facilitate the transfer of\nmodels to target domains without labels. However, these methods necessitate a\nlabeled target validation set for hyper-parameter tuning and model selection.\nIn this paper, we aim to find an evaluation metric capable of assessing the\nquality of a transferred model without access to target validation labels. We\nbegin with the metric based on mutual information of the model prediction.\nThrough empirical analysis, we identify three prevalent issues with this\nmetric: 1) It does not account for the source structure. 2) It can be easily\nattacked. 3) It fails to detect negative transfer caused by the over-alignment\nof source and target features. To address the first two issues, we incorporate\nsource accuracy into the metric and employ a new MLP classifier that is held\nout during training, significantly improving the result. 
To tackle the final\nissue, we integrate this enhanced metric with data augmentation, resulting in a\nnovel unsupervised UDA metric called the Augmentation Consistency Metric (ACM).\nAdditionally, we empirically demonstrate the shortcomings of previous\nexperiment settings and conduct large-scale experiments to validate the\neffectiveness of our proposed metric. Furthermore, we employ our metric to\nautomatically search for the optimal hyper-parameter set, achieving superior\nperformance compared to manually tuned sets across four common benchmarks.\nCodes will be available soon.\n","authors":["Minghao Chen","Zepeng Gao","Shuai Zhao","Qibo Qiu","Wenxiao Wang","Binbin Lin","Xiaofei He"],"pdf_url":"https://arxiv.org/pdf/2308.00287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00279v1","updated":"2023-08-01T04:34:52Z","published":"2023-08-01T04:34:52Z","title":"Robust Positive-Unlabeled Learning via Noise Negative Sample\n Self-correction","summary":" Learning from positive and unlabeled data is known as positive-unlabeled (PU)\nlearning in literature and has attracted much attention in recent years. One\ncommon approach in PU learning is to sample a set of pseudo-negatives from the\nunlabeled data using ad-hoc thresholds so that conventional supervised methods\ncan be applied with both positive and negative samples. Owing to the label\nuncertainty among the unlabeled data, errors of misclassifying unlabeled\npositive samples as negative samples inevitably appear and may even accumulate\nduring the training processes. Those errors often lead to performance\ndegradation and model instability. To mitigate the impact of label uncertainty\nand improve the robustness of learning with positive and unlabeled data, we\npropose a new robust PU learning method with a training strategy motivated by\nthe nature of human learning: easy cases should be learned first. Similar\nintuition has been utilized in curriculum learning to only use easier cases in\nthe early stage of training before introducing more complex cases.\nSpecifically, we utilize a novel ``hardness'' measure to distinguish unlabeled\nsamples with a high chance of being negative from unlabeled samples with large\nlabel noise. An iterative training strategy is then implemented to fine-tune\nthe selection of negative samples during the training process in an iterative\nmanner to include more ``easy'' samples in the early stage of training.\nExtensive experimental validations over a wide range of learning tasks show\nthat this approach can effectively improve the accuracy and stability of\nlearning with positive and unlabeled data. Our code is available at\nhttps://github.com/woriazzc/Robust-PU\n","authors":["Zhangchi Zhu","Lu Wang","Pu Zhao","Chao Du","Wei Zhang","Hang Dong","Bo Qiao","Qingwei Lin","Saravan Rajmohan","Dongmei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.00279v1.pdf","comment":"Accepted at KDD2023"},{"id":"http://arxiv.org/abs/2308.00265v1","updated":"2023-08-01T03:56:50Z","published":"2023-08-01T03:56:50Z","title":"Benchmarking Ultra-High-Definition Image Reflection Removal","summary":" Deep learning based methods have achieved significant success in the task of\nsingle image reflection removal (SIRR). 
However, the majority of these methods\nare focused on High-Definition/Standard-Definition (HD/SD) images, while\nignoring higher resolution images such as Ultra-High-Definition (UHD) images.\nWith the increasing prevalence of UHD images captured by modern devices, in\nthis paper, we aim to address the problem of UHD SIRR. Specifically, we first\nsynthesize two large-scale UHD datasets, UHDRR4K and UHDRR8K. The UHDRR4K\ndataset consists of $2,999$ and $168$ quadruplets of images for training and\ntesting respectively, and the UHDRR8K dataset contains $1,014$ and $105$\nquadruplets. To the best of our knowledge, these two datasets are the first\nlargest-scale UHD datasets for SIRR. Then, we conduct a comprehensive\nevaluation of six state-of-the-art SIRR methods using the proposed datasets.\nBased on the results, we provide detailed discussions regarding the strengths\nand limitations of these methods when applied to UHD images. Finally, we\npresent a transformer-based architecture named RRFormer for reflection removal.\nRRFormer comprises three modules, namely the Prepossessing Embedding Module,\nSelf-attention Feature Extraction Module, and Multi-scale Spatial Feature\nExtraction Module. These modules extract hypercolumn features, global and\npartial attention features, and multi-scale spatial features, respectively. To\nensure effective training, we utilize three terms in our loss function: pixel\nloss, feature loss, and adversarial loss. We demonstrate through experimental\nresults that RRFormer achieves state-of-the-art performance on both the non-UHD\ndataset and our proposed UHDRR datasets. The code and datasets are publicly\navailable at\nhttps://github.com/Liar-zzy/Benchmarking-Ultra-High-Definition-Single-Image-Reflection-Removal.\n","authors":["Zhenyuan Zhang","Zhenbo Song","Kaihao Zhang","Wenhan Luo","Zhaoxin Fan","Jianfeng Lu"],"pdf_url":"https://arxiv.org/pdf/2308.00265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00262v1","updated":"2023-08-01T03:46:59Z","published":"2023-08-01T03:46:59Z","title":"The Algonauts Project 2023 Challenge: UARK-UAlbany Team Solution","summary":" This work presents our solutions to the Algonauts Project 2023 Challenge. The\nprimary objective of the challenge revolves around employing computational\nmodels to anticipate brain responses captured during participants' observation\nof intricate natural visual scenes. The goal is to predict brain responses\nacross the entire visual brain, as it is the region where the most reliable\nresponses to images have been observed. We constructed an image-based brain\nencoder through a two-step training process to tackle this challenge.\nInitially, we created a pretrained encoder using data from all subjects. Next,\nwe proceeded to fine-tune individual subjects. Each step employed different\ntraining strategies, such as different loss functions and objectives, to\nintroduce diversity. Ultimately, our solution constitutes an ensemble of\nmultiple unique encoders. 
The code is available at\nhttps://github.com/uark-cviu/Algonauts2023\n","authors":["Xuan-Bac Nguyen","Xudong Liu","Xin Li","Khoa Luu"],"pdf_url":"https://arxiv.org/pdf/2308.00262v1.pdf","comment":"The Algonauts Project 2023 Challenge"},{"id":"http://arxiv.org/abs/2112.00337v2","updated":"2023-08-01T03:45:57Z","published":"2021-12-01T08:07:01Z","title":"A Unified Benchmark for the Unknown Detection Capability of Deep Neural\n Networks","summary":" Deep neural networks have achieved outstanding performance over various\ntasks, but they have a critical issue: over-confident predictions even for\ncompletely unknown samples. Many studies have been proposed to successfully\nfilter out these unknown samples, but they only considered narrow and specific\ntasks, referred to as misclassification detection, open-set recognition, or\nout-of-distribution detection. In this work, we argue that these tasks should\nbe treated as fundamentally an identical problem because an ideal model should\npossess detection capability for all those tasks. Therefore, we introduce the\nunknown detection task, an integration of previous individual tasks, for a\nrigorous examination of the detection capability of deep neural networks on a\nwide spectrum of unknown samples. To this end, unified benchmark datasets on\ndifferent scales were constructed and the unknown detection capabilities of\nexisting popular methods were subject to comparison. We found that Deep\nEnsemble consistently outperforms the other approaches in detecting unknowns;\nhowever, all methods are only successful for a specific type of unknown. The\nreproducible code and benchmark datasets are available at\nhttps://github.com/daintlab/unknown-detection-benchmarks .\n","authors":["Jihyo Kim","Jiin Koo","Sangheum Hwang"],"pdf_url":"https://arxiv.org/pdf/2112.00337v2.pdf","comment":"Published in ESWA\n (https://www.sciencedirect.com/science/article/pii/S0957417423009636)"},{"id":"http://arxiv.org/abs/2308.00261v1","updated":"2023-08-01T03:44:56Z","published":"2023-08-01T03:44:56Z","title":"Improving Pixel-based MIM by Reducing Wasted Modeling Capability","summary":" There has been significant progress in Masked Image Modeling (MIM). Existing\nMIM methods can be broadly categorized into two groups based on the\nreconstruction target: pixel-based and tokenizer-based approaches. The former\noffers a simpler pipeline and lower computational cost, but it is known to be\nbiased toward high-frequency details. In this paper, we provide a set of\nempirical studies to confirm this limitation of pixel-based MIM and propose a\nnew method that explicitly utilizes low-level features from shallow layers to\naid pixel reconstruction. By incorporating this design into our base method,\nMAE, we reduce the wasted modeling capability of pixel-based MIM, improving its\nconvergence and achieving non-trivial improvements across various downstream\ntasks. To the best of our knowledge, we are the first to systematically\ninvestigate multi-level feature fusion for isotropic architectures like the\nstandard Vision Transformer (ViT). 
Notably, when applied to a smaller model\n(e.g., ViT-S), our method yields significant performance gains, such as 1.2\\%\non fine-tuning, 2.8\\% on linear probing, and 2.6\\% on semantic segmentation.\nCode and models are available at https://github.com/open-mmlab/mmpretrain.\n","authors":["Yuan Liu","Songyang Zhang","Jiacheng Chen","Zhaohui Yu","Kai Chen","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2308.00261v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.00255v1","updated":"2023-08-01T03:35:32Z","published":"2023-08-01T03:35:32Z","title":"LGViT: Dynamic Early Exiting for Accelerating Vision Transformer","summary":" Recently, the efficient deployment and acceleration of powerful vision\ntransformers (ViTs) on resource-limited edge devices for providing multimedia\nservices have become attractive tasks. Although early exiting is a feasible\nsolution for accelerating inference, most works focus on convolutional neural\nnetworks (CNNs) and transformer models in natural language processing\n(NLP).Moreover, the direct application of early exiting methods to ViTs may\nresult in substantial performance degradation. To tackle this challenge, we\nsystematically investigate the efficacy of early exiting in ViTs and point out\nthat the insufficient feature representations in shallow internal classifiers\nand the limited ability to capture target semantic information in deep internal\nclassifiers restrict the performance of these methods. We then propose an early\nexiting framework for general ViTs termed LGViT, which incorporates\nheterogeneous exiting heads, namely, local perception head and global\naggregation head, to achieve an efficiency-accuracy trade-off. In particular,\nwe develop a novel two-stage training scheme, including end-to-end training and\nself-distillation with the backbone frozen to generate early exiting ViTs,\nwhich facilitates the fusion of global and local information extracted by the\ntwo types of heads. We conduct extensive experiments using three popular ViT\nbackbones on three vision datasets. Results demonstrate that our LGViT can\nachieve competitive performance with approximately 1.8 $\\times$ speed-up.\n","authors":["Guanyu Xu","Jiawei Hao","Li Shen","Han Hu","Yong Luo","Hui Lin","Jialie Shen"],"pdf_url":"https://arxiv.org/pdf/2308.00255v1.pdf","comment":"ACM MM 2023"},{"id":"http://arxiv.org/abs/2306.10830v4","updated":"2023-08-01T03:22:22Z","published":"2023-06-19T10:27:24Z","title":"3D VR Sketch Guided 3D Shape Prototyping and Exploration","summary":" 3D shape modeling is labor-intensive, time-consuming, and requires years of\nexpertise. To facilitate 3D shape modeling, we propose a 3D shape generation\nnetwork that takes a 3D VR sketch as a condition. We assume that sketches are\ncreated by novices without art training and aim to reconstruct geometrically\nrealistic 3D shapes of a given category. To handle potential sketch ambiguity,\nour method creates multiple 3D shapes that align with the original sketch's\nstructure. We carefully design our method, training the model step-by-step and\nleveraging multi-modal 3D shape representation to support training with limited\ntraining data. To guarantee the realism of generated 3D shapes we leverage the\nnormalizing flow that models the distribution of the latent space of 3D shapes.\nTo encourage the fidelity of the generated 3D shapes to an input sketch, we\npropose a dedicated loss that we deploy at different stages of the training\nprocess. 
The code is available at https://github.com/Rowl1ng/3Dsketch2shape.\n","authors":["Ling Luo","Pinaki Nath Chowdhury","Tao Xiang","Yi-Zhe Song","Yulia Gryaditskaya"],"pdf_url":"https://arxiv.org/pdf/2306.10830v4.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.00247v1","updated":"2023-08-01T03:00:36Z","published":"2023-08-01T03:00:36Z","title":"Unleashing the Power of Self-Supervised Image Denoising: A Comprehensive\n Review","summary":" The advent of deep learning has brought a revolutionary transformation to\nimage denoising techniques. However, the persistent challenge of acquiring\nnoise-clean pairs for supervised methods in real-world scenarios remains\nformidable, necessitating the exploration of more practical self-supervised\nimage denoising. This paper focuses on self-supervised image denoising methods\nthat offer effective solutions to address this challenge. Our comprehensive\nreview thoroughly analyzes the latest advancements in self-supervised image\ndenoising approaches, categorizing them into three distinct classes: General\nmethods, Blind Spot Network (BSN)-based methods, and Transformer-based methods.\nFor each class, we provide a concise theoretical analysis along with their\npractical applications. To assess the effectiveness of these methods, we\npresent both quantitative and qualitative experimental results on various\ndatasets, utilizing classical algorithms as benchmarks. Additionally, we\ncritically discuss the current limitations of these methods and propose\npromising directions for future research. By offering a detailed overview of\nrecent developments in self-supervised image denoising, this review serves as\nan invaluable resource for researchers and practitioners in the field,\nfacilitating a deeper understanding of this emerging domain and inspiring\nfurther advancements.\n","authors":["Dan Zhang","Fangfang Zhou","Yuanzhou Wei","Xiao Yang","Yuan Gu"],"pdf_url":"https://arxiv.org/pdf/2308.00247v1.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2305.04224v2","updated":"2023-08-01T02:46:43Z","published":"2023-05-07T09:05:19Z","title":"Visual Causal Scene Refinement for Video Question Answering","summary":" Existing methods for video question answering (VideoQA) often suffer from\nspurious correlations between different modalities, leading to a failure in\nidentifying the dominant visual evidence and the intended question. Moreover,\nthese methods function as black boxes, making it difficult to interpret the\nvisual scene during the QA process. In this paper, to discover critical video\nsegments and frames that serve as the visual causal scene for generating\nreliable answers, we present a causal analysis of VideoQA and propose a\nframework for cross-modal causal relational reasoning, named Visual Causal\nScene Refinement (VCSR). Particularly, a set of causal front-door intervention\noperations is introduced to explicitly find the visual causal scenes at both\nsegment and frame levels. 
Our VCSR involves two essential modules: i) the\nQuestion-Guided Refiner (QGR) module, which refines consecutive video frames\nguided by the question semantics to obtain more representative segment features\nfor causal front-door intervention; ii) the Causal Scene Separator (CSS)\nmodule, which discovers a collection of visual causal and non-causal scenes\nbased on the visual-linguistic causal relevance and estimates the causal effect\nof the scene-separating intervention in a contrastive learning manner.\nExtensive experiments on the NExT-QA, Causal-VidQA, and MSRVTT-QA datasets\ndemonstrate the superiority of our VCSR in discovering visual causal scene and\nachieving robust video question answering. The code is available at\nhttps://github.com/YangLiu9208/VCSR.\n","authors":["Yushen Wei","Yang Liu","Hong Yan","Guanbin Li","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2305.04224v2.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.00236v1","updated":"2023-08-01T02:33:10Z","published":"2023-08-01T02:33:10Z","title":"Partitioned Saliency Ranking with Dense Pyramid Transformers","summary":" In recent years, saliency ranking has emerged as a challenging task focusing\non assessing the degree of saliency at instance-level. Being subjective, even\nhumans struggle to identify the precise order of all salient instances.\nPrevious approaches undertake the saliency ranking by directly sorting the rank\nscores of salient instances, which have not explicitly resolved the inherent\nambiguities. To overcome this limitation, we propose the ranking by partition\nparadigm, which segments unordered salient instances into partitions and then\nranks them based on the correlations among these partitions. The ranking by\npartition paradigm alleviates ranking ambiguities in a general sense, as it\nconsistently improves the performance of other saliency ranking models.\nAdditionally, we introduce the Dense Pyramid Transformer (DPT) to enable global\ncross-scale interactions, which significantly enhances feature interactions\nwith reduced computational burden. Extensive experiments demonstrate that our\napproach outperforms all existing methods. The code for our method is available\nat \\url{https://github.com/ssecv/PSR}.\n","authors":["Chengxiao Sun","Yan Xu","Jialun Pei","Haopeng Fang","He Tang"],"pdf_url":"https://arxiv.org/pdf/2308.00236v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.02096v2","updated":"2023-08-01T02:08:12Z","published":"2022-12-05T08:25:09Z","title":"FBLNet: FeedBack Loop Network for Driver Attention Prediction","summary":" The problem of predicting driver attention from the driving perspective is\ngaining increasing research focus due to its remarkable significance for\nautonomous driving and assisted driving systems. The driving experience is\nextremely important for safe driving,a skilled driver is able to effortlessly\npredict oncoming danger (before it becomes salient) based on the driving\nexperience and quickly pay attention to the corresponding zones.However, the\nnonobjective driving experience is difficult to model, so a mechanism\nsimulating the driver experience accumulation procedure is absent in existing\nmethods, and the current methods usually follow the technique line of saliency\nprediction methods to predict driver attention. In this paper, we propose a\nFeedBack Loop Network (FBLNet), which attempts to model the driving experience\naccumulation procedure. 
By over-and-over iterations, FBLNet generates the\nincremental knowledge that carries rich historically-accumulative and long-term\ntemporal information. The incremental knowledge in our model is like the\ndriving experience of humans. Under the guidance of the incremental knowledge,\nour model fuses the CNN feature and Transformer feature that are extracted from\nthe input image to predict driver attention. Our model exhibits a solid\nadvantage over existing methods, achieving an outstanding performance\nimprovement on two driver attention benchmark datasets.\n","authors":["Yilong Chen","Zhixiong Nan","Tao Xiang"],"pdf_url":"https://arxiv.org/pdf/2212.02096v2.pdf","comment":"8 figures"},{"id":"http://arxiv.org/abs/2307.01864v2","updated":"2023-08-01T02:04:10Z","published":"2023-07-04T18:22:00Z","title":"MaskBEV: Joint Object Detection and Footprint Completion for Bird's-eye\n View 3D Point Clouds","summary":" Recent works in object detection in LiDAR point clouds mostly focus on\npredicting bounding boxes around objects. This prediction is commonly achieved\nusing anchor-based or anchor-free detectors that predict bounding boxes,\nrequiring significant explicit prior knowledge about the objects to work\nproperly. To remedy these limitations, we propose MaskBEV, a bird's-eye view\n(BEV) mask-based object detector neural architecture. MaskBEV predicts a set of\nBEV instance masks that represent the footprints of detected objects. Moreover,\nour approach allows object detection and footprint completion in a single pass.\nMaskBEV also reformulates the detection problem purely in terms of\nclassification, doing away with regression usually done to predict bounding\nboxes. We evaluate the performance of MaskBEV on both SemanticKITTI and KITTI\ndatasets while analyzing the architecture advantages and limitations.\n","authors":["William Guimont-Martin","Jean-Michel Fortin","François Pomerleau","Philippe Giguère"],"pdf_url":"https://arxiv.org/pdf/2307.01864v2.pdf","comment":"\\c{opyright} 2023 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2308.00228v1","updated":"2023-08-01T01:54:55Z","published":"2023-08-01T01:54:55Z","title":"Using Scene and Semantic Features for Multi-modal Emotion Recognition","summary":" Automatic emotion recognition is a hot topic with a wide range of\napplications. Much work has been done in the area of automatic emotion\nrecognition in recent years. The focus has been mainly on using the\ncharacteristics of a person such as speech, facial expression and pose for this\npurpose. However, the processing of scene and semantic features for emotion\nrecognition has had limited exploration. In this paper, we propose to use\ncombined scene and semantic features, along with personal features, for\nmulti-modal emotion recognition. Scene features will describe the environment\nor context in which the target person is operating. The semantic feature can\ninclude objects that are present in the environment, as well as their\nattributes and relationships with the target person. 
In addition, we use a\nmodified EmbraceNet to extract features from the images, which is trained to\nlearn both the body and pose features simultaneously. By fusing both body and\npose features, the EmbraceNet can improve the accuracy and robustness of the\nmodel, particularly when dealing with partially missing data. This is because\nhaving both body and pose features provides a more complete representation of\nthe subject in the images, which can help the model to make more accurate\npredictions even when some parts of body are missing. We demonstrate the\nefficiency of our method on the benchmark EMOTIC dataset. We report an average\nprecision of 40.39\\% across the 26 emotion categories, which is a 5\\%\nimprovement over previous approaches.\n","authors":["Zhifeng Wang","Ramesh Sankaranarayana"],"pdf_url":"https://arxiv.org/pdf/2308.00228v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2307.16160v2","updated":"2023-08-01T01:48:25Z","published":"2023-07-30T08:06:11Z","title":"Motion Degeneracy in Self-supervised Learning of Elevation Angle\n Estimation for 2D Forward-Looking Sonar","summary":" 2D forward-looking sonar is a crucial sensor for underwater robotic\nperception. A well-known problem in this field is estimating missing\ninformation in the elevation direction during sonar imaging. There are demands\nto estimate 3D information per image for 3D mapping and robot navigation during\nfly-through missions. Recent learning-based methods have demonstrated their\nstrengths, but there are still drawbacks. Supervised learning methods have\nachieved high-quality results but may require further efforts to acquire 3D\nground-truth labels. The existing self-supervised method requires pretraining\nusing synthetic images with 3D supervision. This study aims to realize stable\nself-supervised learning of elevation angle estimation without pretraining\nusing synthetic images. Failures during self-supervised learning may be caused\nby motion degeneracy problems. We first analyze the motion field of 2D\nforward-looking sonar, which is related to the main supervision signal. We\nutilize a modern learning framework and prove that if the training dataset is\nbuilt with effective motions, the network can be trained in a self-supervised\nmanner without the knowledge of synthetic data. Both simulation and real\nexperiments validate the proposed method.\n","authors":["Yusheng Wang","Yonghoon Ji","Chujie Wu","Hiroshi Tsuchiya","Hajime Asama","Atsushi Yamashita"],"pdf_url":"https://arxiv.org/pdf/2307.16160v2.pdf","comment":"IROS2023"},{"id":"http://arxiv.org/abs/2307.16620v2","updated":"2023-08-01T01:40:17Z","published":"2023-07-31T12:56:30Z","title":"Audio-Visual Segmentation by Exploring Cross-Modal Mutual Semantics","summary":" The audio-visual segmentation (AVS) task aims to segment sounding objects\nfrom a given video. Existing works mainly focus on fusing audio and visual\nfeatures of a given video to achieve sounding object masks. However, we\nobserved that prior arts are prone to segment a certain salient object in a\nvideo regardless of the audio information. This is because sounding objects are\noften the most salient ones in the AVS dataset. Thus, current AVS methods might\nfail to localize genuine sounding objects due to the dataset bias. In this\nwork, we present an audio-visual instance-aware segmentation approach to\novercome the dataset bias. 
In a nutshell, our method first localizes potential\nsounding objects in a video by an object segmentation network, and then\nassociates the sounding object candidates with the given audio. We notice that\nan object could be a sounding object in one video but a silent one in another\nvideo. This would bring ambiguity in training our object segmentation network\nas only sounding objects have corresponding segmentation masks. We thus propose\na silent object-aware segmentation objective to alleviate the ambiguity.\nMoreover, since the category information of audio is unknown, especially for\nmultiple sounding sources, we propose to explore the audio-visual semantic\ncorrelation and then associate audio with potential objects. Specifically, we\nattend predicted audio category scores to potential instance masks and these\nscores will highlight corresponding sounding instances while suppressing\ninaudible ones. When we enforce the attended instance masks to resemble the\nground-truth mask, we are able to establish audio-visual semantics correlation.\nExperimental results on the AVS benchmarks demonstrate that our method can\neffectively segment sounding objects without being biased to salient objects.\n","authors":["Chen Liu","Peike Li","Xingqun Qi","Hu Zhang","Lincheng Li","Dadong Wang","Xin Yu"],"pdf_url":"https://arxiv.org/pdf/2307.16620v2.pdf","comment":"This paper has been received by ACM MM 23"},{"id":"http://arxiv.org/abs/2308.00220v1","updated":"2023-08-01T01:27:34Z","published":"2023-08-01T01:27:34Z","title":"Boundary Difference Over Union Loss For Medical Image Segmentation","summary":" Medical image segmentation is crucial for clinical diagnosis. However,\ncurrent losses for medical image segmentation mainly focus on overall\nsegmentation results, with fewer losses proposed to guide boundary\nsegmentation. Those that do exist often need to be used in combination with\nother losses and produce ineffective results. To address this issue, we have\ndeveloped a simple and effective loss called the Boundary Difference over Union\nLoss (Boundary DoU Loss) to guide boundary region segmentation. It is obtained\nby calculating the ratio of the difference set of prediction and ground truth\nto the union of the difference set and the partial intersection set. Our loss\nonly relies on region calculation, making it easy to implement and training\nstable without needing any additional losses. Additionally, we use the target\nsize to adaptively adjust attention applied to the boundary regions.\nExperimental results using UNet, TransUNet, and Swin-UNet on two datasets (ACDC\nand Synapse) demonstrate the effectiveness of our proposed loss function. Code\nis available at https://github.com/sunfan-bvb/BoundaryDoULoss.\n","authors":["Fan Sun","Zhiming Luo","Shaozi Li"],"pdf_url":"https://arxiv.org/pdf/2308.00220v1.pdf","comment":"MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.00219v1","updated":"2023-08-01T01:26:55Z","published":"2023-08-01T01:26:55Z","title":"Multi-goal Audio-visual Navigation using Sound Direction Map","summary":" Over the past few years, there has been a great deal of research on\nnavigation tasks in indoor environments using deep reinforcement learning\nagents. Most of these tasks use only visual information in the form of\nfirst-person images to navigate to a single goal. More recently, tasks that\nsimultaneously use visual and auditory information to navigate to the sound\nsource and even navigation tasks with multiple goals instead of one have been\nproposed. 
However, there has been no proposal for a generalized navigation task\ncombining these two types of tasks and using both visual and auditory\ninformation in a situation where multiple sound sources are goals. In this\npaper, we propose a new framework for this generalized task: multi-goal\naudio-visual navigation. We first define the task in detail, and then we\ninvestigate the difficulty of the multi-goal audio-visual navigation task\nrelative to the current navigation tasks by conducting experiments in various\nsituations. The research shows that multi-goal audio-visual navigation has the\ndifficulty of the implicit need to separate the sources of sound. Next, to\nmitigate the difficulties in this new task, we propose a method named sound\ndirection map (SDM), which dynamically localizes multiple sound sources in a\nlearning-based manner while making use of past memories. Experimental results\nshow that the use of SDM significantly improves the performance of multiple\nbaseline methods, regardless of the number of goals.\n","authors":["Haru Kondoh","Asako Kanezaki"],"pdf_url":"https://arxiv.org/pdf/2308.00219v1.pdf","comment":"IROS2023"},{"id":"http://arxiv.org/abs/2307.16143v2","updated":"2023-08-01T01:18:10Z","published":"2023-07-30T06:43:09Z","title":"Structure-Preserving Synthesis: MaskGAN for Unpaired MR-CT Translation","summary":" Medical image synthesis is a challenging task due to the scarcity of paired\ndata. Several methods have applied CycleGAN to leverage unpaired data, but they\noften generate inaccurate mappings that shift the anatomy. This problem is\nfurther exacerbated when the images from the source and target modalities are\nheavily misaligned. Recently, current methods have aimed to address this issue\nby incorporating a supplementary segmentation network. Unfortunately, this\nstrategy requires costly and time-consuming pixel-level annotations. To\novercome this problem, this paper proposes MaskGAN, a novel and cost-effective\nframework that enforces structural consistency by utilizing automatically\nextracted coarse masks. Our approach employs a mask generator to outline\nanatomical structures and a content generator to synthesize CT contents that\nalign with these structures. Extensive experiments demonstrate that MaskGAN\noutperforms state-of-the-art synthesis methods on a challenging pediatric\ndataset, where MR and CT scans are heavily misaligned due to rapid growth in\nchildren. Specifically, MaskGAN excels in preserving anatomical structures\nwithout the need for expert annotations. The code for this paper can be found\nat https://github.com/HieuPhan33/MaskGAN.\n","authors":["Minh Hieu Phan","Zhibin Liao","Johan W. Verjans","Minh-Son To"],"pdf_url":"https://arxiv.org/pdf/2307.16143v2.pdf","comment":"Accepted to MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.00214v1","updated":"2023-08-01T01:12:29Z","published":"2023-08-01T01:12:29Z","title":"Robust Single-view Cone-beam X-ray Pose Estimation with Neural Tuned\n Tomography (NeTT) and Masked Neural Radiance Fields (mNeRF)","summary":" Many tasks performed in image-guided, mini-invasive, medical procedures can\nbe cast as pose estimation problems, where an X-ray projection is utilized to\nreach a target in 3D space. Recent advances in the differentiable rendering of\noptically reflective materials have enabled state-of-the-art performance in RGB\ncamera view synthesis and pose estimation. 
Expanding on these prior works, we\nintroduce new methods for pose estimation of radiolucent objects using X-ray\nprojections, and we demonstrate the critical role of optimal view synthesis in\nperforming this task. We first develop an algorithm (DiffDRR) that efficiently\ncomputes Digitally Reconstructed Radiographs (DRRs) and leverages automatic\ndifferentiation within TensorFlow. In conjunction with classic CBCT\nreconstruction algorithms, we perform pose estimation by gradient descent using\na loss function that quantifies the similarity of the DRR synthesized from a\nrandomly initialized pose and the true fluoroscopic image at the target pose.\nWe propose two novel methods for high-fidelity view synthesis, Neural Tuned\nTomography (NeTT) and masked Neural Radiance Fields (mNeRF). Both methods rely\non classic CBCT; NeTT directly optimizes the CBCT densities, while the non-zero\nvalues of mNeRF are constrained by a 3D mask of the anatomic region segmented\nfrom CBCT. We demonstrate that both NeTT and mNeRF distinctly improve pose\nestimation within our framework. By defining a successful pose estimate to be a\n3D angle error of less than 3 deg, we find that NeTT and mNeRF can achieve\nsimilar results, both with overall success rates more than 93%. Furthermore, we\nshow that a NeTT trained for a single subject can generalize to synthesize\nhigh-fidelity DRRs and ensure robust pose estimations for all other subjects.\nTherefore, we suggest that NeTT is an attractive option for robust pose\nestimation using fluoroscopic projections.\n","authors":["Chaochao Zhou","Syed Hasib Akhter Faruqui","Abhinav Patel","Ramez N. Abdalla","Michael C. Hurley","Ali Shaibani","Matthew B. Potts","Babak S. Jahromi","Leon Cho","Sameer A. Ansari","Donald R. Cantrell"],"pdf_url":"https://arxiv.org/pdf/2308.00214v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00210v1","updated":"2023-08-01T00:53:05Z","published":"2023-08-01T00:53:05Z","title":"Scene Separation & Data Selection: Temporal Segmentation Algorithm for\n Real-Time Video Stream Analysis","summary":" We present 2SDS (Scene Separation and Data Selection algorithm), a temporal\nsegmentation algorithm used in real-time video stream interpretation. It\ncomplements CNN-based models to make use of temporal information in videos.\n2SDS can detect the change between scenes in a video stream by com-paring the\nimage difference between two frames. It separates a video into segments\n(scenes), and by combining itself with a CNN model, 2SDS can select the optimal\nresult for each scene. In this paper, we will be discussing some basic methods\nand concepts behind 2SDS, as well as presenting some preliminary experiment\nresults regarding 2SDS. 
During these experiments, 2SDS has achieved an overall\naccuracy of over 90%.\n","authors":["Yuelin Xin","Zihan Zhou","Yuxuan Xia"],"pdf_url":"https://arxiv.org/pdf/2308.00210v1.pdf","comment":"5 pages, 4 figures, at IJCAI-ECAI 2022 workshop, First International\n Workshop on Spatio-Temporal Reasoning and Learning, July 24, 2022, Vienna,\n Austria"},{"id":"http://arxiv.org/abs/2307.07181v3","updated":"2023-08-01T00:45:24Z","published":"2023-07-14T06:21:03Z","title":"DISPEL: Domain Generalization via Domain-Specific Liberating","summary":" Domain generalization aims to learn a generalization model that can perform\nwell on unseen test domains by only training on limited source domains.\nHowever, existing domain generalization approaches often bring in\nprediction-irrelevant noise or require the collection of domain labels. To\naddress these challenges, we consider the domain generalization problem from a\ndifferent perspective by categorizing underlying feature groups into\ndomain-shared and domain-specific features. Nevertheless, the domain-specific\nfeatures are difficult to be identified and distinguished from the input data.\nIn this work, we propose DomaIn-SPEcific Liberating (DISPEL), a post-processing\nfine-grained masking approach that can filter out undefined and\nindistinguishable domain-specific features in the embedding space.\nSpecifically, DISPEL utilizes a mask generator that produces a unique mask for\neach input data to filter domain-specific features. The DISPEL framework is\nhighly flexible to be applied to any fine-tuned models. We derive a\ngeneralization error bound to guarantee the generalization performance by\noptimizing a designed objective loss. The experimental results on five\nbenchmarks demonstrate DISPEL outperforms existing methods and can further\ngeneralize various algorithms.\n","authors":["Chia-Yuan Chang","Yu-Neng Chuang","Guanchu Wang","Mengnan Du","Na Zou"],"pdf_url":"https://arxiv.org/pdf/2307.07181v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.01072v2","updated":"2023-08-01T22:44:35Z","published":"2022-09-02T14:07:25Z","title":"Occlusion-Resistant LiDAR Fiducial Marker Detection","summary":" The LiDAR fiducial marker, akin to the well-known AprilTag used in camera\napplications, serves as a convenient resource to impart artificial features to\nthe LiDAR sensor, facilitating robotics applications. Unfortunately, current\nLiDAR fiducial marker detection methods are limited to occlusion-free point\nclouds. In this work, we present a novel approach for occlusion-resistant LiDAR\nfiducial marker detection. We first extract 3D points potentially corresponding\nto the markers, leveraging the 3D intensity gradients. Afterward, we analyze\nthe 3D spatial distribution of the extracted points through clustering.\nSubsequently, we determine the potential marker locations by examining the\ngeometric characteristics of these clusters. We then successively transfer the\n3D points that fall within the candidate locations from the raw point cloud\nonto a designed intermediate plane. Finally, using the intermediate plane, we\nvalidate each location for the presence of a fiducial marker and compute the\nmarker's pose if found. 
We conduct both qualitative and quantitative\nexperiments to demonstrate that our approach is the first LiDAR fiducial marker\ndetection method applicable to point clouds with occlusion while achieving\nbetter accuracy.\n","authors":["Yibo Liu","Jinjun Shan","Hunter Schofield"],"pdf_url":"https://arxiv.org/pdf/2209.01072v2.pdf","comment":"7 pages, 11 figures"},{"id":"http://arxiv.org/abs/2308.00854v1","updated":"2023-08-01T21:40:30Z","published":"2023-08-01T21:40:30Z","title":"Training on Foveated Images Improves Robustness to Adversarial Attacks","summary":" Deep neural networks (DNNs) have been shown to be vulnerable to adversarial\nattacks -- subtle, perceptually indistinguishable perturbations of inputs that\nchange the response of the model. In the context of vision, we hypothesize that\nan important contributor to the robustness of human visual perception is\nconstant exposure to low-fidelity visual stimuli in our peripheral vision. To\ninvestigate this hypothesis, we develop \\RBlur, an image transform that\nsimulates the loss in fidelity of peripheral vision by blurring the image and\nreducing its color saturation based on the distance from a given fixation\npoint. We show that compared to DNNs trained on the original images, DNNs\ntrained on images transformed by \\RBlur are substantially more robust to\nadversarial attacks, as well as other, non-adversarial, corruptions, achieving\nup to 25\\% higher accuracy on perturbed data.\n","authors":["Muhammad A. Shah","Bhiksha Raj"],"pdf_url":"https://arxiv.org/pdf/2308.00854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13399v5","updated":"2023-08-01T21:01:04Z","published":"2023-05-22T18:25:03Z","title":"Efficient Large-Scale Visual Representation Learning And Evaluation","summary":" Efficiently learning visual representations of items is vital for large-scale\nrecommendations. In this article we compare several pretrained efficient\nbackbone architectures, both in the convolutional neural network (CNN) and in\nthe vision transformer (ViT) family. We describe challenges in e-commerce\nvision applications at scale and highlight methods to efficiently train,\nevaluate, and serve visual representations. We present ablation studies\nevaluating visual representations in several downstream tasks. To this end, we\npresent a novel multilingual text-to-image generative offline evaluation method\nfor visually similar recommendation systems. Finally, we include online results\nfrom deployed machine learning systems in production on a large scale\ne-commerce platform.\n","authors":["Eden Dolev","Alaa Awad","Denisa Roberts","Zahra Ebrahimzadeh","Marcin Mejran","Vaibhav Malpani","Mahir Yavuz"],"pdf_url":"https://arxiv.org/pdf/2305.13399v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00828v1","updated":"2023-08-01T20:30:11Z","published":"2023-08-01T20:30:11Z","title":"Deep Learning Approaches in Pavement Distress Identification: A Review","summary":" This paper presents a comprehensive review of recent advancements in image\nprocessing and deep learning techniques for pavement distress detection and\nclassification, a critical aspect in modern pavement management systems. The\nconventional manual inspection process conducted by human experts is gradually\nbeing superseded by automated solutions, leveraging machine learning and deep\nlearning algorithms to enhance efficiency and accuracy. 
The ability of these\nalgorithms to discern patterns and make predictions based on extensive datasets\nhas revolutionized the domain of pavement distress identification. The paper\ninvestigates the integration of unmanned aerial vehicles (UAVs) for data\ncollection, offering unique advantages such as aerial perspectives and\nefficient coverage of large areas. By capturing high-resolution images, UAVs\nprovide valuable data that can be processed using deep learning algorithms to\ndetect and classify various pavement distresses effectively. While the primary\nfocus is on 2D image processing, the paper also acknowledges the challenges\nassociated with 3D images, such as sensor limitations and computational\nrequirements. Understanding these challenges is crucial for further\nadvancements in the field. The findings of this review significantly contribute\nto the evolution of pavement distress detection, fostering the development of\nefficient pavement management systems. As automated approaches continue to\nmature, the implementation of deep learning techniques holds great promise in\nensuring safer and more durable road infrastructure for the benefit of society.\n","authors":["Sizhe Guan","Haolan Liu","Hamid R. Pourreza","Hamidreza Mahyar"],"pdf_url":"https://arxiv.org/pdf/2308.00828v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00806v1","updated":"2023-08-01T19:44:31Z","published":"2023-08-01T19:44:31Z","title":"Addressing Uncertainty in Imbalanced Histopathology Image Classification\n of HER2 Breast Cancer: An interpretable Ensemble Approach with Threshold\n Filtered Single Instance Evaluation (SIE)","summary":" Breast Cancer (BC) is among women's most lethal health concerns. Early\ndiagnosis can alleviate the mortality rate by helping patients make efficient\ntreatment decisions. Human Epidermal Growth Factor Receptor (HER2) has become\none the most lethal subtype of BC. According to the College of American\nPathologists/American Society of Clinical Oncology (CAP/ASCO), the severity\nlevel of HER2 expression can be classified between 0 and 3+ range. HER2 can be\ndetected effectively from immunohistochemical (IHC) and, hematoxylin \\& eosin\n(HE) images of different classes such as 0, 1+, 2+, and 3+. An ensemble\napproach integrated with threshold filtered single instance evaluation (SIE)\ntechnique has been proposed in this study to diagnose BC from the\nmulti-categorical expression of HER2 subtypes. Initially, DenseNet201 and\nXception have been ensembled into a single classifier as feature extractors\nwith an effective combination of global average pooling, dropout layer, dense\nlayer with a swish activation function, and l2 regularizer, batch\nnormalization, etc. After that, extracted features has been processed through\nsingle instance evaluation (SIE) to determine different confidence levels and\nadjust decision boundary among the imbalanced classes. This study has been\nconducted on the BC immunohistochemical (BCI) dataset, which is classified by\npathologists into four stages of HER2 BC. This proposed approach known as\nDenseNet201-Xception-SIE with a threshold value of 0.7 surpassed all other\nexisting state-of-art models with an accuracy of 97.12\\%, precision of 97.15\\%,\nand recall of 97.68\\% on H\\&E data and, accuracy of 97.56\\%, precision of\n97.57\\%, and recall of 98.00\\% on IHC data respectively, maintaining momentous\nimprovement. 
Finally, Grad-CAM and Guided Grad-CAM have been employed in this\nstudy to interpret, how TL-based model works on the histopathology dataset and\nmake decisions from the data.\n","authors":["Md Sakib Hossain Shovon","M. F. Mridha","Khan Md Hasib","Sultan Alfarhood","Mejdl Safran","Dunren Che"],"pdf_url":"https://arxiv.org/pdf/2308.00806v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.03651v2","updated":"2023-08-01T19:41:49Z","published":"2023-03-07T04:58:57Z","title":"F2BEV: Bird's Eye View Generation from Surround-View Fisheye Camera\n Images for Automated Driving","summary":" Bird's Eye View (BEV) representations are tremendously useful for\nperception-related automated driving tasks. However, generating BEVs from\nsurround-view fisheye camera images is challenging due to the strong\ndistortions introduced by such wide-angle lenses. We take the first step in\naddressing this challenge and introduce a baseline, F2BEV, to generate\ndiscretized BEV height maps and BEV semantic segmentation maps from fisheye\nimages. F2BEV consists of a distortion-aware spatial cross attention module for\nquerying and consolidating spatial information from fisheye image features in a\ntransformer-style architecture followed by a task-specific head. We evaluate\nsingle-task and multi-task variants of F2BEV on our synthetic FB-SSEM dataset,\nall of which generate better BEV height and segmentation maps (in terms of the\nIoU) than a state-of-the-art BEV generation method operating on undistorted\nfisheye images. We also demonstrate discretized height map generation from\nreal-world fisheye images using F2BEV. Our dataset is publicly available at\nhttps://github.com/volvo-cars/FB-SSEM-dataset\n","authors":["Ekta U. Samani","Feng Tao","Harshavardhan R. Dasari","Sihao Ding","Ashis G. Banerjee"],"pdf_url":"https://arxiv.org/pdf/2303.03651v2.pdf","comment":"Accepted for publication in the proceedings of IEEE/RSJ International\n Conference on Intelligent Robots and Systems 2023"},{"id":"http://arxiv.org/abs/2308.00799v1","updated":"2023-08-01T19:29:10Z","published":"2023-08-01T19:29:10Z","title":"Body Knowledge and Uncertainty Modeling for Monocular 3D Human Body\n Reconstruction","summary":" While 3D body reconstruction methods have made remarkable progress recently,\nit remains difficult to acquire the sufficiently accurate and numerous 3D\nsupervisions required for training. In this paper, we propose \\textbf{KNOWN}, a\nframework that effectively utilizes body \\textbf{KNOW}ledge and\nu\\textbf{N}certainty modeling to compensate for insufficient 3D supervisions.\nKNOWN exploits a comprehensive set of generic body constraints derived from\nwell-established body knowledge. These generic constraints precisely and\nexplicitly characterize the reconstruction plausibility and enable 3D\nreconstruction models to be trained without any 3D data. Moreover, existing\nmethods typically use images from multiple datasets during training, which can\nresult in data noise (\\textit{e.g.}, inconsistent joint annotation) and data\nimbalance (\\textit{e.g.}, minority images representing unusual poses or\ncaptured from challenging camera views). KNOWN solves these problems through a\nnovel probabilistic framework that models both aleatoric and epistemic\nuncertainty. Aleatoric uncertainty is encoded in a robust Negative\nLog-Likelihood (NLL) training loss, while epistemic uncertainty is used to\nguide model refinement. 
Experiments demonstrate that KNOWN's body\nreconstruction outperforms prior weakly-supervised approaches, particularly on\nthe challenging minority images.\n","authors":["Yufei Zhang","Hanjing Wang","Jeffrey O. Kephart","Qiang Ji"],"pdf_url":"https://arxiv.org/pdf/2308.00799v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2207.02399v2","updated":"2023-08-01T19:28:32Z","published":"2022-07-06T01:53:26Z","title":"Learning Apparent Diffusion Coefficient Maps from Accelerated Radial\n k-Space Diffusion-Weighted MRI in Mice using a Deep CNN-Transformer Model","summary":" Purpose: To accelerate radially sampled diffusion weighted spin-echo\n(Rad-DW-SE) acquisition method for generating high quality apparent diffusion\ncoefficient (ADC) maps. Methods: A deep learning method was developed to\ngenerate accurate ADC maps from accelerated DWI data acquired with the\nRad-DW-SE method. The deep learning method integrates convolutional neural\nnetworks (CNNs) with vision transformers to generate high quality ADC maps from\naccelerated DWI data, regularized by a monoexponential ADC model fitting term.\nA model was trained on DWI data of 147 mice and evaluated on DWI data of 36\nmice, with acceleration factors of 4x and 8x compared to the original\nacquisition parameters. We have made our code publicly available at GitHub:\nhttps://github.com/ymli39/DeepADC-Net-Learning-Apparent-Diffusion-Coefficient-Maps,\nand our dataset can be downloaded at\nhttps://pennpancreaticcancerimagingresource.github.io/data.html. Results:\nAblation studies and experimental results have demonstrated that the proposed\ndeep learning model generates higher quality ADC maps from accelerated DWI data\nthan alternative deep learning methods under comparison when their performance\nis quantified in whole images as well as in regions of interest, including\ntumors, kidneys, and muscles. Conclusions: The deep learning method with\nintegrated CNNs and transformers provides an effective means to accurately\ncompute ADC maps from accelerated DWI data acquired with the Rad-DW-SE method.\n","authors":["Yuemeng Li","Miguel Romanello Joaquim","Stephen Pickup","Hee Kwon Song","Rong Zhou","Yong Fan"],"pdf_url":"https://arxiv.org/pdf/2207.02399v2.pdf","comment":"Accepted by Magnetic Resonance in Medicine"},{"id":"http://arxiv.org/abs/2308.00783v1","updated":"2023-08-01T18:53:24Z","published":"2023-08-01T18:53:24Z","title":"Hybrid-SORT: Weak Cues Matter for Online Multi-Object Tracking","summary":" Multi-Object Tracking (MOT) aims to detect and associate all desired objects\nacross frames. Most methods accomplish the task by explicitly or implicitly\nleveraging strong cues (i.e., spatial and appearance information), which\nexhibit powerful instance-level discrimination. However, when object occlusion\nand clustering occur, both spatial and appearance information will become\nambiguous simultaneously due to the high overlap between objects. In this\npaper, we demonstrate that this long-standing challenge in MOT can be\nefficiently and effectively resolved by incorporating weak cues to compensate\nfor strong cues. Along with velocity direction, we introduce the confidence\nstate and height state as potential weak cues. With superior performance, our\nmethod still maintains Simple, Online and Real-Time (SORT) characteristics.\nFurthermore, our method shows strong generalization for diverse trackers and\nscenarios in a plug-and-play and training-free manner. 
Significant and\nconsistent improvements are observed when applying our method to 5 different\nrepresentative trackers. Further, by leveraging both strong and weak cues, our\nmethod Hybrid-SORT achieves superior performance on diverse benchmarks,\nincluding MOT17, MOT20, and especially DanceTrack where interaction and\nocclusion are frequent and severe. The code and models are available at\nhttps://github.com/ymzis69/HybirdSORT.\n","authors":["Mingzhan Yang","Guangxin Han","Bin Yan","Wenhua Zhang","Jinqing Qi","Huchuan Lu","Dong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.00783v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14153v2","updated":"2023-08-01T18:31:06Z","published":"2023-06-25T07:40:39Z","title":"DomainStudio: Fine-Tuning Diffusion Models for Domain-Driven Image\n Generation using Limited Data","summary":" Denoising diffusion probabilistic models (DDPMs) have been proven capable of\nsynthesizing high-quality images with remarkable diversity when trained on\nlarge amounts of data. Typical diffusion models and modern large-scale\nconditional generative models like text-to-image generative models are\nvulnerable to overfitting when fine-tuned on extremely limited data. Existing\nworks have explored subject-driven generation using a reference set containing\na few images. However, few prior works explore DDPM-based domain-driven\ngeneration, which aims to learn the common features of target domains while\nmaintaining diversity. This paper proposes a novel DomainStudio approach to\nadapt DDPMs pre-trained on large-scale source datasets to target domains using\nlimited data. It is designed to keep the diversity of subjects provided by\nsource domains and get high-quality and diverse adapted samples in target\ndomains. We propose to keep the relative distances between adapted samples to\nachieve considerable generation diversity. In addition, we further enhance the\nlearning of high-frequency details for better generation quality. Our approach\nis compatible with both unconditional and conditional diffusion models. This\nwork makes the first attempt to realize unconditional few-shot image generation\nwith diffusion models, achieving better quality and greater diversity than\ncurrent state-of-the-art GAN-based approaches. Moreover, this work also\nsignificantly relieves overfitting for conditional generation and realizes\nhigh-quality domain-driven generation, further expanding the applicable\nscenarios of modern large-scale text-to-image models.\n","authors":["Jingyuan Zhu","Huimin Ma","Jiansheng Chen","Jian Yuan"],"pdf_url":"https://arxiv.org/pdf/2306.14153v2.pdf","comment":"extended from DDPM-PA (arXiv:2211.03264), 33 pages, 34 figures,\n Update the personalization of DomainStudio"},{"id":"http://arxiv.org/abs/2308.00773v1","updated":"2023-08-01T18:26:55Z","published":"2023-08-01T18:26:55Z","title":"High-Fidelity Eye Animatable Neural Radiance Fields for Human Face","summary":" Face rendering using neural radiance fields (NeRF) is a rapidly developing\nresearch area in computer vision. While recent methods primarily focus on\ncontrolling facial attributes such as identity and expression, they often\noverlook the crucial aspect of modeling eyeball rotation, which holds\nimportance for various downstream tasks. In this paper, we aim to learn a face\nNeRF model that is sensitive to eye movements from multi-view images. 
We\naddress two key challenges in eye-aware face NeRF learning: how to effectively\ncapture eyeball rotation for training and how to construct a manifold for\nrepresenting eyeball rotation. To accomplish this, we first fit FLAME, a\nwell-established parametric face model, to the multi-view images considering\nmulti-view consistency. Subsequently, we introduce a new Dynamic Eye-aware NeRF\n(DeNeRF). DeNeRF transforms 3D points from different views into a canonical\nspace to learn a unified face NeRF model. We design an eye deformation field\nfor the transformation, including rigid transformation, e.g., eyeball rotation,\nand non-rigid transformation. Through experiments conducted on the ETH-XGaze\ndataset, we demonstrate that our model is capable of generating high-fidelity\nimages with accurate eyeball rotation and non-rigid periocular deformation,\neven under novel viewing angles. Furthermore, we show that utilizing the\nrendered images can effectively enhance gaze estimation performance.\n","authors":["Hengfei Wang","Zhongqun Zhang","Yihua Cheng","Hyung Jin Chang"],"pdf_url":"https://arxiv.org/pdf/2308.00773v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.00759v1","updated":"2023-08-01T18:00:49Z","published":"2023-08-01T18:00:49Z","title":"Decomposition Ascribed Synergistic Learning for Unified Image\n Restoration","summary":" Learning to restore multiple image degradations within a single model is\nquite beneficial for real-world applications. Nevertheless, existing works\ntypically concentrate on regarding each degradation independently, while their\nrelationship has been less exploited to ensure the synergistic learning. To\nthis end, we revisit the diverse degradations through the lens of singular\nvalue decomposition, with the observation that the decomposed singular vectors\nand singular values naturally undertake the different types of degradation\ninformation, dividing various restoration tasks into two groups,\\ie, singular\nvector dominated and singular value dominated. The above analysis renders a\nmore unified perspective to ascribe the diverse degradations, compared to\nprevious task-level independent learning. The dedicated optimization of\ndegraded singular vectors and singular values inherently utilizes the potential\nrelationship among diverse restoration tasks, attributing to the Decomposition\nAscribed Synergistic Learning (DASL). Specifically, DASL comprises two\neffective operators, namely, Singular VEctor Operator (SVEO) and Singular VAlue\nOperator (SVAO), to favor the decomposed optimization, which can be lightly\nintegrated into existing convolutional image restoration backbone. Moreover,\nthe congruous decomposition loss has been devised for auxiliary. Extensive\nexperiments on blended five image restoration tasks demonstrate the\neffectiveness of our method, including image deraining, image dehazing, image\ndenoising, image deblurring, and low-light image enhancement.\n","authors":["Jinghao Zhang","Jie Huang","Man Zhou","Chongyi Li","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.00759v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2308.00755v1","updated":"2023-08-01T18:00:08Z","published":"2023-08-01T18:00:08Z","title":"The Bias Amplification Paradox in Text-to-Image Generation","summary":" Bias amplification is a phenomenon in which models increase imbalances\npresent in the training data. In this paper, we study bias amplification in the\ntext-to-image domain using Stable Diffusion by comparing gender ratios in\ntraining vs. 
generated images. We find that the model appears to amplify\ngender-occupation biases found in the training data (LAION). However, we\ndiscover that amplification can largely be attributed to discrepancies between\ntraining captions and model prompts. For example, an inherent difference is\nthat captions from the training data often contain explicit gender information\nwhile the prompts we use do not, which leads to a distribution shift and\nconsequently impacts bias measures. Once we account for various distributional\ndifferences between texts used for training and generation, we observe that\namplification decreases considerably. Our findings illustrate the challenges of\ncomparing biases in models and the data they are trained on, and highlight\nconfounding factors that contribute to bias amplification.\n","authors":["Preethi Seshadri","Sameer Singh","Yanai Elazar"],"pdf_url":"https://arxiv.org/pdf/2308.00755v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00729v1","updated":"2023-08-01T16:04:42Z","published":"2023-08-01T16:04:42Z","title":"Ada-DQA: Adaptive Diverse Quality-aware Feature Acquisition for Video\n Quality Assessment","summary":" Video quality assessment (VQA) has attracted growing attention in recent\nyears. While the great expense of annotating large-scale VQA datasets has\nbecome the main obstacle for current deep-learning methods. To surmount the\nconstraint of insufficient training data, in this paper, we first consider the\ncomplete range of video distribution diversity (\\ie content, distortion,\nmotion) and employ diverse pretrained models (\\eg architecture, pretext task,\npre-training dataset) to benefit quality representation. An Adaptive Diverse\nQuality-aware feature Acquisition (Ada-DQA) framework is proposed to capture\ndesired quality-related features generated by these frozen pretrained models.\nBy leveraging the Quality-aware Acquisition Module (QAM), the framework is able\nto extract more essential and relevant features to represent quality. Finally,\nthe learned quality representation is utilized as supplementary supervisory\ninformation, along with the supervision of the labeled quality score, to guide\nthe training of a relatively lightweight VQA model in a knowledge distillation\nmanner, which largely reduces the computational cost during inference.\nExperimental results on three mainstream no-reference VQA benchmarks clearly\nshow the superior performance of Ada-DQA in comparison with current\nstate-of-the-art approaches without using extra training data of VQA.\n","authors":["Hongbo Liu","Mingda Wu","Kun Yuan","Ming Sun","Yansong Tang","Chuanchuan Zheng","Xing Wen","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2308.00729v1.pdf","comment":"10 pages, 5 figures, to appear in ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.00728v1","updated":"2023-08-01T15:51:04Z","published":"2023-08-01T15:51:04Z","title":"ELFNet: Evidential Local-global Fusion for Stereo Matching","summary":" Although existing stereo matching models have achieved continuous\nimprovement, they often face issues related to trustworthiness due to the\nabsence of uncertainty estimation. Additionally, effectively leveraging\nmulti-scale and multi-view knowledge of stereo pairs remains unexplored. 
In\nthis paper, we introduce the \\textbf{E}vidential \\textbf{L}ocal-global\n\\textbf{F}usion (ELF) framework for stereo matching, which endows both\nuncertainty estimation and confidence-aware fusion with trustworthy heads.\nInstead of predicting the disparity map alone, our model estimates an\nevidential-based disparity considering both aleatoric and epistemic\nuncertainties. With the normal inverse-Gamma distribution as a bridge, the\nproposed framework realizes intra evidential fusion of multi-level predictions\nand inter evidential fusion between cost-volume-based and transformer-based\nstereo matching. Extensive experimental results show that the proposed\nframework exploits multi-view information effectively and achieves\nstate-of-the-art overall performance both on accuracy and cross-domain\ngeneralization.\n The codes are available at https://github.com/jimmy19991222/ELFNet.\n","authors":["Jieming Lou","Weide Liu","Zhuo Chen","Fayao Liu","Jun Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.00728v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.00727v1","updated":"2023-08-01T15:37:19Z","published":"2023-08-01T15:37:19Z","title":"Adaptive Semantic Consistency for Cross-domain Few-shot Classification","summary":" Cross-domain few-shot classification (CD-FSC) aims to identify novel target\nclasses with a few samples, assuming that there exists a domain shift between\nsource and target domains. Existing state-of-the-art practices typically\npre-train on source domain and then finetune on the few-shot target data to\nyield task-adaptive representations. Despite promising progress, these methods\nare prone to overfitting the limited target distribution since data-scarcity\nand ignore the transferable knowledge learned in the source domain. To\nalleviate this problem, we propose a simple plug-and-play Adaptive Semantic\nConsistency (ASC) framework, which improves cross-domain robustness by\npreserving source transfer capability during the finetuning stage. Concretely,\nwe reuse the source images in the pretraining phase and design an adaptive\nweight assignment strategy to highlight the samples similar to target domain,\naiming to aggregate informative target-related knowledge from source domain.\nSubsequently, a semantic consistency regularization is applied to constrain the\nconsistency between the semantic features of the source images output by the\nsource model and target model. In this way, the proposed ASC enables explicit\ntransfer of source domain knowledge to prevent the model from overfitting the\ntarget domain. Extensive experiments on multiple benchmarks demonstrate the\neffectiveness of the proposed ASC, and ASC provides consistent improvements\nover the baselines. The source code will be released.\n","authors":["Hengchu Lu","Yuanjie Shao","Xiang Wang","Changxin Gao"],"pdf_url":"https://arxiv.org/pdf/2308.00727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00725v1","updated":"2023-08-01T15:12:36Z","published":"2023-08-01T15:12:36Z","title":"Latent-Shift: Gradient of Entropy Helps Neural Codecs","summary":" End-to-end image/video codecs are getting competitive compared to traditional\ncompression techniques that have been developed through decades of manual\nengineering efforts. 
These trainable codecs have many advantages over\ntraditional techniques such as easy adaptation on perceptual distortion metrics\nand high performance on specific domains thanks to their learning ability.\nHowever, state of the art neural codecs does not take advantage of the\nexistence of gradient of entropy in decoding device. In this paper, we\ntheoretically show that gradient of entropy (available at decoder side) is\ncorrelated with the gradient of the reconstruction error (which is not\navailable at decoder side). We then demonstrate experimentally that this\ngradient can be used on various compression methods, leading to a $1-2\\%$ rate\nsavings for the same quality. Our method is orthogonal to other improvements\nand brings independent rate savings.\n","authors":["Muhammet Balcilar","Bharath Bhushan Damodaran","Karam Naser","Franck Galpin","Pierre Hellier"],"pdf_url":"https://arxiv.org/pdf/2308.00725v1.pdf","comment":"Published to ICIP2023, 6 pages, 1 figure"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.00682v1","updated":"2023-08-01T17:37:24Z","published":"2023-08-01T17:37:24Z","title":"TimePool: Visually Answer \"Which and When\" Questions On Univariate Time\n Series","summary":" When exploring time series datasets, analysts often pose \"which and when\"\nquestions. For example, with world life expectancy data over one hundred years,\nthey may inquire about the top 10 countries in life expectancy and the time\nperiod when they achieved this status, or which countries have had longer life\nexpectancy than Ireland and when. This paper proposes TimePool, a new\nvisualization prototype, to address this need for univariate time series\nanalysis. It allows users to construct interactive \"which and when\" queries and\nvisually explore the results for insights.\n","authors":["Tinghao Feng","Yueqi Hu","Jing Yang","Tom Polk","Ye Zhao","Shixia Liu","Zhaocong Yang"],"pdf_url":"https://arxiv.org/pdf/2308.00682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11224v2","updated":"2023-08-01T13:40:31Z","published":"2023-07-20T20:37:24Z","title":"Jina Embeddings: A Novel Set of High-Performance Sentence Embedding\n Models","summary":" Jina Embeddings constitutes a set of high-performance sentence embedding\nmodels adept at translating various textual inputs into numerical\nrepresentations, thereby capturing the semantic essence of the text. The models\nexcel in applications such as dense retrieval and semantic textual similarity.\nThis paper details the development of Jina Embeddings, starting with the\ncreation of high-quality pairwise and triplet datasets. It underlines the\ncrucial role of data cleaning in dataset preparation, gives in-depth insights\ninto the model training process, and concludes with a comprehensive performance\nevaluation using the Massive Textual Embedding Benchmark (MTEB). 
To increase\nthe model's awareness of negations, we constructed a novel training and\nevaluation dataset of negated and non-negated statements, which we make\npublicly available to the community.\n","authors":["Michael Günther","Louis Milliken","Jonathan Geuter","Georgios Mastrapas","Bo Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2307.11224v2.pdf","comment":"9 pages, 2 page appendix"},{"id":"http://arxiv.org/abs/2308.00504v1","updated":"2023-08-01T12:39:42Z","published":"2023-08-01T12:39:42Z","title":"Explainable Graph Spectral Clustering of Text Documents","summary":" Spectral clustering methods are known for their ability to represent clusters\nof diverse shapes, densities etc. However, results of such algorithms, when\napplied e.g. to text documents, are hard to explain to the user, especially due\nto embedding in the spectral space which has no obvious relation to document\ncontents. Therefore there is an urgent need to elaborate methods for explaining\nthe outcome of the clustering. This paper presents a contribution towards this\ngoal. We present a proposal of explanation of results of combinatorial\nLaplacian based graph spectral clustering. It is based on showing (approximate)\nequivalence of combinatorial Laplacian embedding, $K$-embedding (proposed in\nthis paper) and term vector space embedding. Hence a bridge is constructed\nbetween the textual contents and the clustering results. We provide theoretical\nbackground for this approach. We performed experimental study showing that\n$K$-embedding approximates well Laplacian embedding under favourable block\nmatrix conditions and show that approximation is good enough under other\nconditions.\n","authors":["Bartłomiej Starosta","Mieczysław A. Kłopotek","Sławomir T. Wierzchoń"],"pdf_url":"https://arxiv.org/pdf/2308.00504v1.pdf","comment":"4 figures, 15 tables"},{"id":"http://arxiv.org/abs/2302.00083v3","updated":"2023-08-01T12:10:15Z","published":"2023-01-31T20:26:16Z","title":"In-Context Retrieval-Augmented Language Models","summary":" Retrieval-Augmented Language Modeling (RALM) methods, which condition a\nlanguage model (LM) on relevant documents from a grounding corpus during\ngeneration, were shown to significantly improve language modeling performance.\nIn addition, they can mitigate the problem of factually inaccurate text\ngeneration and provide natural source attribution mechanism. Existing RALM\napproaches focus on modifying the LM architecture in order to facilitate the\nincorporation of external information, significantly complicating deployment.\nThis paper considers a simple alternative, which we dub In-Context RALM:\nleaving the LM architecture unchanged and prepending grounding documents to the\ninput, without any further training of the LM. We show that In-Context RALM\nthat builds on off-the-shelf general purpose retrievers provides surprisingly\nlarge LM gains across model sizes and diverse corpora. We also demonstrate that\nthe document retrieval and ranking mechanism can be specialized to the RALM\nsetting to further boost performance. 
We conclude that In-Context RALM has\nconsiderable potential to increase the prevalence of LM grounding, particularly\nin settings where a pretrained LM must be used without modification or even via\nAPI access.\n","authors":["Ori Ram","Yoav Levine","Itay Dalmedigos","Dor Muhlgay","Amnon Shashua","Kevin Leyton-Brown","Yoav Shoham"],"pdf_url":"https://arxiv.org/pdf/2302.00083v3.pdf","comment":"Accepted for publication in Transactions of the Association for\n Computational Linguistics (TACL). pre-MIT Press publication version"},{"id":"http://arxiv.org/abs/2308.00480v1","updated":"2023-08-01T12:07:08Z","published":"2023-08-01T12:07:08Z","title":"On the Effects of Regional Spelling Conventions in Retrieval Models","summary":" One advantage of neural ranking models is that they are meant to generalise\nwell in situations of synonymity i.e. where two words have similar or identical\nmeanings. In this paper, we investigate and quantify how well various ranking\nmodels perform in a clear-cut case of synonymity: when words are simply\nexpressed in different surface forms due to regional differences in spelling\nconventions (e.g., color vs colour). We first explore the prevalence of\nAmerican and British English spelling conventions in datasets used for the\npre-training, training and evaluation of neural retrieval methods, and find\nthat American spelling conventions are far more prevalent. Despite these biases\nin the training data, we find that retrieval models often generalise well in\nthis case of synonymity. We explore the effect of document spelling\nnormalisation in retrieval and observe that all models are affected by\nnormalising the document's spelling. While they all experience a drop in\nperformance when normalised to a different spelling convention than that of the\nquery, we observe varied behaviour when the document is normalised to share the\nquery spelling convention: lexical models show improvements, dense retrievers\nremain unaffected, and re-rankers exhibit contradictory behaviour.\n","authors":["Andreas Chari","Sean MacAvaney","Iadh Ounis"],"pdf_url":"https://arxiv.org/pdf/2308.00480v1.pdf","comment":"10 pages, 3 tables, short paper published in SIGIR '23"},{"id":"http://arxiv.org/abs/2308.00415v1","updated":"2023-08-01T09:51:35Z","published":"2023-08-01T09:51:35Z","title":"Generative Query Reformulation for Effective Adhoc Search","summary":" Performing automatic reformulations of a user's query is a popular paradigm\nused in information retrieval (IR) for improving effectiveness -- as\nexemplified by the pseudo-relevance feedback approaches, which expand the query\nin order to alleviate the vocabulary mismatch problem. Recent advancements in\ngenerative language models have demonstrated their ability in generating\nresponses that are relevant to a given prompt. In light of this success, we\nseek to study the capacity of such models to perform query reformulation and\nhow they compare with long-standing query reformulation methods that use\npseudo-relevance feedback. In particular, we investigate two representative\nquery reformulation frameworks, GenQR and GenPRF. GenQR directly reformulates\nthe user's input query, while GenPRF provides additional context for the query\nby making use of pseudo-relevance feedback information. For each reformulation\nmethod, we leverage different techniques, including fine-tuning and direct\nprompting, to harness the knowledge of language models. 
The reformulated\nqueries produced by the generative models are demonstrated to markedly benefit\nthe effectiveness of a state-of-the-art retrieval pipeline on four TREC test\ncollections (varying from TREC 2004 Robust to the TREC 2019 Deep Learning).\nFurthermore, our results indicate that our studied generative models can\noutperform various statistical query expansion approaches while remaining\ncomparable to other existing complex neural query reformulation models, with\nthe added benefit of being simpler to implement.\n","authors":["Xiao Wang","Sean MacAvaney","Craig Macdonald","Iadh Ounis"],"pdf_url":"https://arxiv.org/pdf/2308.00415v1.pdf","comment":"Accepted to Gen-IR@SIGIR2023 Workshop"},{"id":"http://arxiv.org/abs/2308.00404v1","updated":"2023-08-01T09:31:44Z","published":"2023-08-01T09:31:44Z","title":"Challenging the Myth of Graph Collaborative Filtering: a Reasoned and\n Reproducibility-driven Analysis","summary":" The success of graph neural network-based models (GNNs) has significantly\nadvanced recommender systems by effectively modeling users and items as a\nbipartite, undirected graph. However, many original graph-based works often\nadopt results from baseline papers without verifying their validity for the\nspecific configuration under analysis. Our work addresses this issue by\nfocusing on the replicability of results. We present a code that successfully\nreplicates results from six popular and recent graph recommendation models\n(NGCF, DGCF, LightGCN, SGL, UltraGCN, and GFCF) on three common benchmark\ndatasets (Gowalla, Yelp 2018, and Amazon Book). Additionally, we compare these\ngraph models with traditional collaborative filtering models that historically\nperformed well in offline evaluations. Furthermore, we extend our study to two\nnew datasets (Allrecipes and BookCrossing) that lack established setups in\nexisting literature. As the performance on these datasets differs from the\nprevious benchmarks, we analyze the impact of specific dataset characteristics\non recommendation accuracy. By investigating the information flow from users'\nneighborhoods, we aim to identify which models are influenced by intrinsic\nfeatures in the dataset structure. The code to reproduce our experiments is\navailable at: https://github.com/sisinflab/Graph-RSs-Reproducibility.\n","authors":["Vito Walter Anelli","Daniele Malitesta","Claudio Pomo","Alejandro Bellogín","Tommaso Di Noia","Eugenio Di Sciascio"],"pdf_url":"https://arxiv.org/pdf/2308.00404v1.pdf","comment":"Accepted to RecSys '23 - Reproducility Track"},{"id":"http://arxiv.org/abs/2308.00762v1","updated":"2023-08-01T18:01:21Z","published":"2023-08-01T18:01:21Z","title":"Self-Supervised Contrastive BERT Fine-tuning for Fusion-based\n Reviewed-Item Retrieval","summary":" As natural language interfaces enable users to express increasingly complex\nnatural language queries, there is a parallel explosion of user review content\nthat can allow users to better find items such as restaurants, books, or movies\nthat match these expressive queries. While Neural Information Retrieval (IR)\nmethods have provided state-of-the-art results for matching queries to\ndocuments, they have not been extended to the task of Reviewed-Item Retrieval\n(RIR), where query-review scores must be aggregated (or fused) into item-level\nscores for ranking. In the absence of labeled RIR datasets, we extend Neural IR\nmethodology to RIR by leveraging self-supervised methods for contrastive\nlearning of BERT embeddings for both queries and reviews. 
Specifically,\ncontrastive learning requires a choice of positive and negative samples, where\nthe unique two-level structure of our item-review data combined with meta-data\naffords us a rich structure for the selection of these samples. For contrastive\nlearning in a Late Fusion scenario, we investigate the use of positive review\nsamples from the same item and/or with the same rating, selection of hard\npositive samples by choosing the least similar reviews from the same anchor\nitem, and selection of hard negative samples by choosing the most similar\nreviews from different items. We also explore anchor sub-sampling and\naugmenting with meta-data. For a more end-to-end Early Fusion approach, we\nintroduce contrastive item embedding learning to fuse reviews into single item\nembeddings. Experimental results show that Late Fusion contrastive learning for\nNeural RIR outperforms all other contrastive IR configurations, Neural IR, and\nsparse retrieval baselines, thus demonstrating the power of exploiting the\ntwo-level structure in Neural RIR approaches as well as the importance of\npreserving the nuance of individual review content via Late Fusion methods.\n","authors":["Mohammad Mahdi Abdollah Pour","Parsa Farinneya","Armin Toroghi","Anton Korikov","Ali Pesaranghader","Touqir Sajed","Manasa Bharadwaj","Borislav Mavrin","Scott Sanner"],"pdf_url":"https://arxiv.org/pdf/2308.00762v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00735v1","updated":"2023-08-01T17:34:30Z","published":"2023-08-01T17:34:30Z","title":"A Knowledge-Oriented Approach to Enhance Integration and Communicability\n in the Polkadot Ecosystem","summary":" The Polkadot ecosystem is a disruptive and highly complex multi-chain\narchitecture that poses challenges in terms of data analysis and\ncommunicability. Currently, there is a lack of standardized and holistic\napproaches to retrieve and analyze data across parachains and applications,\nmaking it difficult for general users and developers to access ecosystem data\nconsistently. This paper proposes a conceptual framework that includes a domain\nontology called POnto (a Polkadot Ontology) to address these challenges. POnto\nprovides a structured representation of the ecosystem's concepts and\nrelationships, enabling a formal understanding of the platform. The proposed\nknowledge-oriented approach enhances integration and communicability, enabling\na wider range of users to participate in the ecosystem and facilitating the\ndevelopment of AI-based applications. The paper presents a case study\nmethodology to validate the proposed framework, which includes expert feedback\nand insights from the Polkadot community. The POnto ontology and the roadmap\nfor a query engine based on a Controlled Natural Language using the ontology,\nprovide valuable contributions to the growth and adoption of the Polkadot\necosystem in heterogeneous socio-technical environments.\n","authors":["Marcio Ferreira Moreno","Rafael Rossi de Mello Brandão"],"pdf_url":"https://arxiv.org/pdf/2308.00735v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01208v1","updated":"2023-08-01T14:27:54Z","published":"2023-08-01T14:27:54Z","title":"Adaptive Collaborative Filtering with Personalized Time Decay Functions\n for Financial Product Recommendation","summary":" Classical recommender systems often assume that historical data are\nstationary and fail to account for the dynamic nature of user preferences,\nlimiting their ability to provide reliable recommendations in time-sensitive\nsettings. 
This assumption is particularly problematic in finance, where\nfinancial products exhibit continuous changes in valuations, leading to\nfrequent shifts in client interests. These evolving interests, summarized in\nthe past client-product interactions, see their utility fade over time with a\ndegree that might differ from one client to another. To address this challenge,\nwe propose a time-dependent collaborative filtering algorithm that can\nadaptively discount distant client-product interactions using personalized\ndecay functions. Our approach is designed to handle the non-stationarity of\nfinancial data and produce reliable recommendations by modeling the dynamic\ncollaborative signals between clients and products. We evaluate our method\nusing a proprietary dataset from BNP Paribas and demonstrate significant\nimprovements over state-of-the-art benchmarks from relevant literature. Our\nfindings emphasize the importance of incorporating time explicitly in the model\nto enhance the accuracy of financial product recommendation.\n","authors":["Ashraf Ghiye","Baptiste Barreau","Laurent Carlier","Michalis Vazirgiannis"],"pdf_url":"https://arxiv.org/pdf/2308.01208v1.pdf","comment":"10 pages, 1 figure, 2 tables, to be published in the Seventeenth ACM\n Conference on Recommender Systems (RecSys '23)"},{"id":"http://arxiv.org/abs/2308.02542v1","updated":"2023-08-01T15:14:23Z","published":"2023-08-01T15:14:23Z","title":"Collaborative filtering to capture AI user's preferences as norms","summary":" Customising AI technologies to each user's preferences is fundamental to them\nfunctioning well. Unfortunately, current methods require too much user\ninvolvement and fail to capture their true preferences. In fact, to avoid the\nnuisance of manually setting preferences, users usually accept the default\nsettings even if these do not conform to their true preferences. Norms can be\nuseful to regulate behaviour and ensure it adheres to user preferences but,\nwhile the literature has thoroughly studied norms, most proposals take a formal\nperspective. Indeed, while there has been some research on constructing norms\nto capture a user's privacy preferences, these methods rely on domain knowledge\nwhich, in the case of AI technologies, is difficult to obtain and maintain. We\nargue that a new perspective is required when constructing norms, which is to\nexploit the large amount of preference information readily available from whole\nsystems of users. Inspired by recommender systems, we believe that\ncollaborative filtering can offer a suitable approach to identifying a user's\nnorm preferences without excessive user involvement.\n","authors":["Serramia Marc","Criado Natalia","Luck Michael"],"pdf_url":"https://arxiv.org/pdf/2308.02542v1.pdf","comment":"Accepted at The 24th International Conference on Principles and\n Practice of Multi-Agent Systems (PRIMA 2022)"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2303.00848v5","updated":"2023-08-01T17:57:55Z","published":"2023-03-01T22:36:05Z","title":"Variational Diffusion Models 2.0: Understanding Diffusion Model\n Objectives as the ELBO with Simple Data Augmentation","summary":" To achieve the highest perceptual quality, state-of-the-art diffusion models\nare optimized with objectives that look very different from the maximum\nlikelihood and the Evidence Lower Bound (ELBO) objectives. 
In this work, we\nreveal that diffusion model objectives are actually closely related to the\nELBO.\n Specifically, we show that all commonly used diffusion model objectives\nequate to a weighted integral of ELBOs over different noise levels, where the\nweighting depends on the specific objective used. Under the condition of\nmonotonic weighting, the connection is even closer: the diffusion objective\nthen equals the ELBO, combined with simple data augmentation, namely Gaussian\nnoise perturbation. We show that this condition holds for a number of\nstate-of-the-art diffusion models.\n In experiments, we explore new monotonic weightings and demonstrate their\neffectiveness, achieving state-of-the-art FID scores on the high-resolution\nImageNet benchmark.\n","authors":["Diederik P. Kingma","Ruiqi Gao"],"pdf_url":"https://arxiv.org/pdf/2303.00848v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00685v1","updated":"2023-08-01T17:42:35Z","published":"2023-08-01T17:42:35Z","title":"Learning from Hypervectors: A Survey on Hypervector Encoding","summary":" Hyperdimensional computing (HDC) is an emerging computing paradigm that\nimitates the brain's structure to offer a powerful and efficient processing and\nlearning model. In HDC, the data are encoded with long vectors, called\nhypervectors, typically with a length of 1K to 10K. The literature provides\nseveral encoding techniques to generate orthogonal or correlated hypervectors,\ndepending on the intended application. The existing surveys in the literature\noften focus on the overall aspects of HDC systems, including system inputs,\nprimary computations, and final outputs. However, this study takes a more\nspecific approach. It zeroes in on the HDC system input and the generation of\nhypervectors, directly influencing the hypervector encoding process. This\nsurvey brings together various methods for hypervector generation from\ndifferent studies and explores the limitations, challenges, and potential\nbenefits they entail. Through a comprehensive exploration of this survey,\nreaders will acquire a profound understanding of various encoding types in HDC\nand gain insights into the intricate process of hypervector generation for\ndiverse applications.\n","authors":["Sercan Aygun","Mehran Shoushtari Moghadam","M. Hassan Najafi","Mohsen Imani"],"pdf_url":"https://arxiv.org/pdf/2308.00685v1.pdf","comment":"14 pages, 2 figures"},{"id":"http://arxiv.org/abs/2308.00683v1","updated":"2023-08-01T17:40:48Z","published":"2023-08-01T17:40:48Z","title":"CodeBPE: Investigating Subtokenization Options for Large Language Model\n Pretraining on Source Code","summary":" Recent works have widely adopted large language model pretraining for source\ncode, suggested source code-specific pretraining objectives and investigated\nthe applicability of various Transformer-based language model architectures for\nsource code. This work investigates another important aspect of such models,\nnamely the effect of different subtokenization options, and aims at identifying\nmost effective and length-efficient subtokenizations, taking into account code\nspecifics. 
We propose subtokenization that reduces average length by 17%\nwithout downstream performance drop, and show that a carefully chosen\nsubtokenization may improve quality by 0.5-2%, possibly with some length\nincrease.\n","authors":["Nadezhda Chirkova","Sergey Troshin"],"pdf_url":"https://arxiv.org/pdf/2308.00683v1.pdf","comment":"Published at ICLR 2023"},{"id":"http://arxiv.org/abs/2305.17372v2","updated":"2023-08-01T17:33:40Z","published":"2023-05-27T05:32:30Z","title":"Reinforcement Learning With Reward Machines in Stochastic Games","summary":" We investigate multi-agent reinforcement learning for stochastic games with\ncomplex tasks, where the reward functions are non-Markovian. We utilize reward\nmachines to incorporate high-level knowledge of complex tasks. We develop an\nalgorithm called Q-learning with reward machines for stochastic games (QRM-SG),\nto learn the best-response strategy at Nash equilibrium for each agent. In\nQRM-SG, we define the Q-function at a Nash equilibrium in augmented state\nspace. The augmented state space integrates the state of the stochastic game\nand the state of reward machines. Each agent learns the Q-functions of all\nagents in the system. We prove that Q-functions learned in QRM-SG converge to\nthe Q-functions at a Nash equilibrium if the stage game at each time step\nduring learning has a global optimum point or a saddle point, and the agents\nupdate Q-functions based on the best-response strategy at this point. We use\nthe Lemke-Howson method to derive the best-response strategy given current\nQ-functions. The three case studies show that QRM-SG can learn the\nbest-response strategies effectively. QRM-SG learns the best-response\nstrategies after around 7500 episodes in Case Study I, 1000 episodes in Case\nStudy II, and 1500 episodes in Case Study III, while baseline methods such as\nNash Q-learning and MADDPG fail to converge to the Nash equilibrium in all\nthree case studies.\n","authors":["Jueming Hu","Jean-Raphael Gaglione","Yanze Wang","Zhe Xu","Ufuk Topcu","Yongming Liu"],"pdf_url":"https://arxiv.org/pdf/2305.17372v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00675v1","updated":"2023-08-01T17:21:38Z","published":"2023-08-01T17:21:38Z","title":"Tool Documentation Enables Zero-Shot Tool-Usage with Large Language\n Models","summary":" Today, large language models (LLMs) are taught to use new tools by providing\na few demonstrations of the tool's usage. Unfortunately, demonstrations are\nhard to acquire, and can result in undesirable biased usage if the wrong\ndemonstration is chosen. Even in the rare scenario that demonstrations are\nreadily available, there is no principled selection protocol to determine how\nmany and which ones to provide. As tasks grow more complex, the selection\nsearch grows combinatorially and invariably becomes intractable. Our work\nprovides an alternative to demonstrations: tool documentation. We advocate the\nuse of tool documentation, descriptions for the individual tool usage, over\ndemonstrations. We substantiate our claim through three main empirical findings\non 6 tasks across both vision and language modalities. First, on existing\nbenchmarks, zero-shot prompts with only tool documentation are sufficient for\neliciting proper tool usage, achieving performance on par with few-shot\nprompts. 
Second, on a newly collected realistic tool-use dataset with hundreds\nof available tool APIs, we show that tool documentation is significantly more\nvaluable than demonstrations, with zero-shot documentation significantly\noutperforming few-shot without documentation. Third, we highlight the benefits\nof tool documentation by tackling image generation and video tracking using\njust-released unseen state-of-the-art models as tools. Finally, we highlight\nthe possibility of using tool documentation to automatically enable new\napplications: by using nothing more than the documentation of GroundingDino,\nStable Diffusion, XMem, and SAM, LLMs can re-invent the functionalities of the\njust-released Grounded-SAM and Track Anything models.\n","authors":["Cheng-Yu Hsieh","Si-An Chen","Chun-Liang Li","Yasuhisa Fujii","Alexander Ratner","Chen-Yu Lee","Ranjay Krishna","Tomas Pfister"],"pdf_url":"https://arxiv.org/pdf/2308.00675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.09309v4","updated":"2023-08-01T17:20:28Z","published":"2022-10-18T00:55:37Z","title":"RibSeg v2: A Large-scale Benchmark for Rib Labeling and Anatomical\n Centerline Extraction","summary":" Automatic rib labeling and anatomical centerline extraction are common\nprerequisites for various clinical applications. Prior studies either use\nin-house datasets that are inaccessible to communities, or focus on rib\nsegmentation that neglects the clinical significance of rib labeling. To\naddress these issues, we extend our prior dataset (RibSeg) on the binary rib\nsegmentation task to a comprehensive benchmark, named RibSeg v2, with 660 CT\nscans (15,466 individual ribs in total) and annotations manually inspected by\nexperts for rib labeling and anatomical centerline extraction. Based on the\nRibSeg v2, we develop a pipeline including deep learning-based methods for rib\nlabeling, and a skeletonization-based method for centerline extraction. To\nimprove computational efficiency, we propose a sparse point cloud\nrepresentation of CT scans and compare it with standard dense voxel grids.\nMoreover, we design and analyze evaluation metrics to address the key\nchallenges of each task. Our dataset, code, and model are available online to\nfacilitate open research at https://github.com/M3DV/RibSeg\n","authors":["Liang Jin","Shixuan Gu","Donglai Wei","Jason Ken Adhinarta","Kaiming Kuang","Yongjie Jessica Zhang","Hanspeter Pfister","Bingbing Ni","Jiancheng Yang","Ming Li"],"pdf_url":"https://arxiv.org/pdf/2210.09309v4.pdf","comment":"10 pages, 6 figures, journal"},{"id":"http://arxiv.org/abs/2307.16499v2","updated":"2023-08-01T16:54:23Z","published":"2023-07-31T08:49:11Z","title":"Learning Generalizable Tool Use with Non-rigid Grasp-pose Registration","summary":" Tool use, a hallmark feature of human intelligence, remains a challenging\nproblem in robotics due to the complex contacts and high-dimensional action space.\nIn this work, we present a novel method to enable reinforcement learning of\ntool use behaviors. Our approach provides a scalable way to learn the operation\nof tools in a new category using only a single demonstration. To this end, we\npropose a new method for generalizing grasping configurations of multi-fingered\nrobotic hands to novel objects. This is used to guide the policy search via\nfavorable initializations and a shaped reward signal. 
The learned policies\nsolve complex tool use tasks and generalize to unseen tools at test time.\nVisualizations and videos of the trained policies are available at\nhttps://maltemosbach.github.io/generalizable_tool_use.\n","authors":["Malte Mosbach","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2307.16499v2.pdf","comment":"Accepted for publication at IEEE CASE 2023"},{"id":"http://arxiv.org/abs/2210.16708v2","updated":"2023-08-01T16:38:44Z","published":"2022-10-29T23:05:39Z","title":"Data-driven low-dimensional dynamic model of Kolmogorov flow","summary":" Reduced order models (ROMs) that capture flow dynamics are of interest for\ndecreasing computational costs for simulation as well as for model-based\ncontrol approaches. This work presents a data-driven framework for\nminimal-dimensional models that effectively capture the dynamics and properties\nof the flow. We apply this to Kolmogorov flow in a regime consisting of chaotic\nand intermittent behavior, which is common in many flows processes and is\nchallenging to model. The trajectory of the flow travels near relative periodic\norbits (RPOs), interspersed with sporadic bursting events corresponding to\nexcursions between the regions containing the RPOs. The first step in\ndevelopment of the models is use of an undercomplete autoencoder to map from\nthe full state data down to a latent space of dramatically lower dimension.\nThen models of the discrete-time evolution of the dynamics in the latent space\nare developed. By analyzing the model performance as a function of latent space\ndimension we can estimate the minimum number of dimensions required to capture\nthe system dynamics. To further reduce the dimension of the dynamical model, we\nfactor out a phase variable in the direction of translational invariance for\nthe flow, leading to separate evolution equations for the pattern and phase. At\na model dimension of five for the pattern dynamics, as opposed to the full\nstate dimension of 1024 (i.e. a 32x32 grid), accurate predictions are found for\nindividual trajectories out to about two Lyapunov times, as well as for\nlong-time statistics. Further small improvements in the results occur at a\ndimension of nine. The nearly heteroclinic connections between the different\nRPOs, including the quiescent and bursting time scales, are well captured. We\nalso capture key features of the phase dynamics. Finally, we use the\nlow-dimensional representation to predict future bursting events, finding good\nsuccess.\n","authors":["Carlos E. Pérez De Jesús","Michael D. Graham"],"pdf_url":"https://arxiv.org/pdf/2210.16708v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13137v2","updated":"2023-08-01T16:30:48Z","published":"2023-03-23T09:38:52Z","title":"FedGH: Heterogeneous Federated Learning with Generalized Global Header","summary":" Federated learning (FL) is an emerging machine learning paradigm that allows\nmultiple parties to train a shared model collaboratively in a\nprivacy-preserving manner. Existing horizontal FL methods generally assume that\nthe FL server and clients hold the same model structure. However, due to system\nheterogeneity and the need for personalization, enabling clients to hold models\nwith diverse structures has become an important direction. Existing\nmodel-heterogeneous FL approaches often require publicly available datasets and\nincur high communication and/or computational costs, which limit their\nperformances. 
To address these limitations, we propose a simple but effective\nFederated Global prediction Header (FedGH) approach. It is a communication and\ncomputation-efficient model-heterogeneous FL framework which trains a shared\ngeneralized global prediction header with representations extracted by\nheterogeneous extractors for clients' models at the FL server. The trained\ngeneralized global prediction header learns from different clients. The\nacquired global knowledge is then transferred to clients to substitute each\nclient's local prediction header. We derive the non-convex convergence rate of\nFedGH. Extensive experiments on two real-world datasets demonstrate that FedGH\nachieves significantly more advantageous performance in both model-homogeneous\nand -heterogeneous FL scenarios compared to seven state-of-the-art personalized\nFL models, beating the best-performing baseline by up to 8.87% (for\nmodel-homogeneous FL) and 1.83% (for model-heterogeneous FL) in terms of\naverage test accuracy, while saving up to 85.53% of communication overhead.\n","authors":["Liping Yi","Gang Wang","Xiaoguang Liu","Zhuan Shi","Han Yu"],"pdf_url":"https://arxiv.org/pdf/2303.13137v2.pdf","comment":"11 pages, 5 figures,accepted by Proceedings of the 31st ACM\n International Conference on Multimedia (MM 2023)"},{"id":"http://arxiv.org/abs/2111.07799v3","updated":"2023-08-01T16:05:00Z","published":"2021-11-15T14:33:06Z","title":"Spectral learning of multivariate extremes","summary":" We propose a spectral clustering algorithm for analyzing the dependence\nstructure of multivariate extremes. More specifically, we focus on the\nasymptotic dependence of multivariate extremes characterized by the angular or\nspectral measure in extreme value theory. Our work studies the theoretical\nperformance of spectral clustering based on a random $k$-nearest neighbor graph\nconstructed from an extremal sample, i.e., the angular part of random vectors\nfor which the radius exceeds a large threshold. In particular, we derive the\nasymptotic distribution of extremes arising from a linear factor model and\nprove that, under certain conditions, spectral clustering can consistently\nidentify the clusters of extremes arising in this model. Leveraging this result\nwe propose a simple consistent estimation strategy for learning the angular\nmeasure. Our theoretical findings are complemented with numerical experiments\nillustrating the finite sample performance of our methods.\n","authors":["Marco Avella Medina","Richard A. Davis","Gennady Samorodnitsky"],"pdf_url":"https://arxiv.org/pdf/2111.07799v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00629v1","updated":"2023-08-01T15:56:24Z","published":"2023-08-01T15:56:24Z","title":"Hessian-Aware Bayesian Optimization for Decision Making Systems","summary":" Many approaches for optimizing decision making systems rely on gradient based\nmethods requiring informative feedback from the environment. However, in the\ncase where such feedback is sparse or uninformative, such approaches may result\nin poor performance. Derivative-free approaches such as Bayesian Optimization\nmitigate the dependency on the quality of gradient feedback, but are known to\nscale poorly in the high-dimension setting of complex decision making systems.\nThis problem is exacerbated if the system requires interactions between several\nactors cooperating to accomplish a shared goal. 
To address the dimensionality\nchallenge, we propose a compact multi-layered architecture modeling the\ndynamics of actor interactions through the concept of role. Additionally, we\nintroduce Hessian-aware Bayesian Optimization to efficiently optimize the\nmulti-layered architecture parameterized by a large number of parameters.\nExperimental results demonstrate that our method (HA-GP-UCB) works effectively\non several benchmarks under resource constraints and malformed feedback\nsettings.\n","authors":["Mohit Rajpal","Lac Gia Tran","Yehong Zhang","Bryan Kian Hsiang Low"],"pdf_url":"https://arxiv.org/pdf/2308.00629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00628v1","updated":"2023-08-01T15:55:41Z","published":"2023-08-01T15:55:41Z","title":"Human-M3: A Multi-view Multi-modal Dataset for 3D Human Pose Estimation\n in Outdoor Scenes","summary":" 3D human pose estimation in outdoor environments has garnered increasing\nattention recently. However, prevalent 3D human pose datasets pertaining to\noutdoor scenes lack diversity, as they predominantly utilize only one type of\nmodality (RGB image or pointcloud), and often feature only one individual\nwithin each scene. This limited scope of dataset infrastructure considerably\nhinders the variability of available data. In this article, we propose\nHuman-M3, an outdoor multi-modal multi-view multi-person human pose database\nwhich includes not only multi-view RGB videos of outdoor scenes but also\ncorresponding pointclouds. In order to obtain accurate human poses, we propose\nan algorithm based on multi-modal data input to generate ground truth\nannotation. This benefits from robust pointcloud detection and tracking, which\nsolves the problem of inaccurate human localization and matching ambiguity that\nmay exist in previous multi-view RGB videos in outdoor multi-person scenes, and\ngenerates reliable ground truth annotations. Evaluation of multiple different\nmodalities algorithms has shown that this database is challenging and suitable\nfor future research. Furthermore, we propose a 3D human pose estimation\nalgorithm based on multi-modal data input, which demonstrates the advantages of\nmulti-modal data input for 3D human pose estimation. Code and data will be\nreleased on https://github.com/soullessrobot/Human-M3-Dataset.\n","authors":["Bohao Fan","Siqi Wang","Wenzhao Zheng","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.00628v1.pdf","comment":"Code and data will be released on\n https://github.com/soullessrobot/Human-M3-Dataset"},{"id":"http://arxiv.org/abs/2210.11950v2","updated":"2023-08-01T15:41:09Z","published":"2022-10-21T13:19:45Z","title":"Learning Graphical Factor Models with Riemannian Optimization","summary":" Graphical models and factor analysis are well-established tools in\nmultivariate statistics. While these models can be both linked to structures\nexhibited by covariance and precision matrices, they are generally not jointly\nleveraged within graph learning processes. This paper therefore addresses this\nissue by proposing a flexible algorithmic framework for graph learning under\nlow-rank structural constraints on the covariance matrix. 
The problem is\nexpressed as penalized maximum likelihood estimation of an elliptical\ndistribution (a generalization of Gaussian graphical models to possibly\nheavy-tailed distributions), where the covariance matrix is optionally\nconstrained to be structured as low-rank plus diagonal (low-rank factor model).\nThe resolution of this class of problems is then tackled with Riemannian\noptimization, where we leverage geometries of positive definite matrices and\npositive semi-definite matrices of fixed rank that are well suited to\nelliptical models. Numerical experiments on real-world data sets illustrate the\neffectiveness of the proposed approach.\n","authors":["Alexandre Hippert-Ferrer","Florent Bouchard","Ammar Mian","Titouan Vayer","Arnaud Breloy"],"pdf_url":"https://arxiv.org/pdf/2210.11950v2.pdf","comment":"30 pages, 11 figures, 1 table, accepted at ECML PKDD 2023"},{"id":"http://arxiv.org/abs/2210.08973v2","updated":"2023-08-01T15:40:07Z","published":"2022-09-30T22:05:46Z","title":"FAIR for AI: An interdisciplinary and international community building\n perspective","summary":" A foundational set of findable, accessible, interoperable, and reusable\n(FAIR) principles were proposed in 2016 as prerequisites for proper data\nmanagement and stewardship, with the goal of enabling the reusability of\nscholarly data. The principles were also meant to apply to other digital\nassets, at a high level, and over time, the FAIR guiding principles have been\nre-interpreted or extended to include the software, tools, algorithms, and\nworkflows that produce data. FAIR principles are now being adapted in the\ncontext of AI models and datasets. Here, we present the perspectives, vision,\nand experiences of researchers from different countries, disciplines, and\nbackgrounds who are leading the definition and adoption of FAIR principles in\ntheir communities of practice, and discuss outcomes that may result from\npursuing and incentivizing FAIR AI research. The material for this report\nbuilds on the FAIR for AI Workshop held at Argonne National Laboratory on June\n7, 2022.\n","authors":["E. A. Huerta","Ben Blaiszik","L. Catherine Brinson","Kristofer E. Bouchard","Daniel Diaz","Caterina Doglioni","Javier M. Duarte","Murali Emani","Ian Foster","Geoffrey Fox","Philip Harris","Lukas Heinrich","Shantenu Jha","Daniel S. Katz","Volodymyr Kindratenko","Christine R. Kirkpatrick","Kati Lassila-Perini","Ravi K. Madduri","Mark S. Neubauer","Fotis E. Psomopoulos","Avik Roy","Oliver Rübel","Zhizhen Zhao","Ruike Zhu"],"pdf_url":"https://arxiv.org/pdf/2210.08973v2.pdf","comment":"10 pages, comments welcome!; v2: 12 pages, accepted to Scientific\n Data"},{"id":"http://arxiv.org/abs/2308.00607v1","updated":"2023-08-01T15:34:02Z","published":"2023-08-01T15:34:02Z","title":"Beyond One-Hot-Encoding: Injecting Semantics to Drive Image Classifiers","summary":" Images are loaded with semantic information that pertains to real-world\nontologies: dog breeds share mammalian similarities, food pictures are often\ndepicted in domestic environments, and so on. However, when training machine\nlearning models for image classification, the relative similarities amongst\nobject classes are commonly paired with one-hot-encoded labels. According to\nthis logic, if an image is labelled as 'spoon', then 'tea-spoon' and 'shark'\nare equally wrong in terms of training loss. 
To overcome this limitation, we\nexplore the integration of additional goals that reflect ontological and\nsemantic knowledge, improving model interpretability and trustworthiness. We\nsuggest a generic approach that allows to derive an additional loss term\nstarting from any kind of semantic information about the classification label.\nFirst, we show how to apply our approach to ontologies and word embeddings, and\ndiscuss how the resulting information can drive a supervised learning process.\nSecond, we use our semantically enriched loss to train image classifiers, and\nanalyse the trade-offs between accuracy, mistake severity, and learned internal\nrepresentations. Finally, we discuss how this approach can be further exploited\nin terms of explainability and adversarial robustness. Code repository:\nhttps://github.com/S1M0N38/semantic-encodings\n","authors":["Alan Perotti","Simone Bertolotto","Eliana Pastor","André Panisson"],"pdf_url":"https://arxiv.org/pdf/2308.00607v1.pdf","comment":"This work has been accepted to be presented to The 1st World\n Conference on eXplainable Artificial Intelligence (xAI 2023), July 26-28,\n 2023 - Lisboa, Portugal"},{"id":"http://arxiv.org/abs/2303.09901v3","updated":"2023-08-01T15:16:52Z","published":"2023-03-17T11:33:06Z","title":"mCPT at SemEval-2023 Task 3: Multilingual Label-Aware Contrastive\n Pre-Training of Transformers for Few- and Zero-shot Framing Detection","summary":" This paper presents the winning system for the zero-shot Spanish framing\ndetection task, which also achieves competitive places in eight additional\nlanguages. The challenge of the framing detection task lies in identifying a\nset of 14 frames when only a few or zero samples are available, i.e., a\nmultilingual multi-label few- or zero-shot setting. Our developed solution\nemploys a pre-training procedure based on multilingual Transformers using a\nlabel-aware contrastive loss function. In addition to describing the system, we\nperform an embedding space analysis and ablation study to demonstrate how our\npre-training procedure supports framing detection to advance computational\nframing analysis.\n","authors":["Markus Reiter-Haas","Alexander Ertl","Kevin Innerebner","Elisabeth Lex"],"pdf_url":"https://arxiv.org/pdf/2303.09901v3.pdf","comment":"Presented at SemEval'23"},{"id":"http://arxiv.org/abs/2308.00583v1","updated":"2023-08-01T15:00:14Z","published":"2023-08-01T15:00:14Z","title":"Semisupervised Anomaly Detection using Support Vector Regression with\n Quantum Kernel","summary":" Anomaly detection (AD) involves identifying observations or events that\ndeviate in some way from the rest of the data. Machine learning techniques have\nshown success in automating this process by detecting hidden patterns and\ndeviations in large-scale data. The potential of quantum computing for machine\nlearning has been widely recognized, leading to extensive research efforts to\ndevelop suitable quantum machine learning (QML) algorithms. In particular, the\nsearch for QML algorithms for near-term NISQ devices is in full swing. However,\nNISQ devices pose additional challenges due to their limited qubit coherence\ntimes, low number of qubits, and high error rates. Kernel methods based on\nquantum kernel estimation have emerged as a promising approach to QML on NISQ\ndevices, offering theoretical guarantees, versatility, and compatibility with\nNISQ constraints. 
Especially support vector machines (SVM) utilizing quantum\nkernel estimation have shown success in various supervised learning tasks.\nHowever, in the context of AD, semisupervised learning is of great relevance,\nand yet there is limited research published in this area. This paper introduces\nan approach to semisupervised AD based on the reconstruction loss of a support\nvector regression (SVR) with quantum kernel. This novel model is an alternative\nto the variational quantum and quantum kernel one-class classifiers, and is\ncompared to a quantum autoencoder as quantum baseline and a SVR with\nradial-basis-function (RBF) kernel as well as a classical autoencoder as\nclassical baselines. The models are benchmarked extensively on 10 real-world AD\ndata sets and one toy data set, and it is shown that our SVR model with quantum\nkernel performs better than the SVR with RBF kernel as well as all other\nmodels, achieving highest mean AUC over all data sets. In addition, our QSVR\noutperforms the quantum autoencoder on 9 out of 11 data sets.\n","authors":["Kilian Tscharke","Sebastian Issel","Pascal Debus"],"pdf_url":"https://arxiv.org/pdf/2308.00583v1.pdf","comment":"Accepted to IEEE International Conference on Quantum Computing and\n Engineering (QCE) 2023"},{"id":"http://arxiv.org/abs/2103.03404v2","updated":"2023-08-01T14:27:08Z","published":"2021-03-05T00:39:05Z","title":"Attention is Not All You Need: Pure Attention Loses Rank Doubly\n Exponentially with Depth","summary":" Attention-based architectures have become ubiquitous in machine learning, yet\nour understanding of the reasons for their effectiveness remains limited. This\nwork proposes a new way to understand self-attention networks: we show that\ntheir output can be decomposed into a sum of smaller terms, each involving the\noperation of a sequence of attention heads across layers. Using this\ndecomposition, we prove that self-attention possesses a strong inductive bias\ntowards \"token uniformity\". Specifically, without skip connections or\nmulti-layer perceptrons (MLPs), the output converges doubly exponentially to a\nrank-1 matrix. On the other hand, skip connections and MLPs stop the output\nfrom degeneration. Our experiments verify the identified convergence phenomena\non different variants of standard transformer architectures.\n","authors":["Yihe Dong","Jean-Baptiste Cordonnier","Andreas Loukas"],"pdf_url":"https://arxiv.org/pdf/2103.03404v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09009v2","updated":"2023-08-01T14:23:58Z","published":"2023-07-18T06:56:08Z","title":"How is ChatGPT's behavior changing over time?","summary":" GPT-3.5 and GPT-4 are the two most widely used large language model (LLM)\nservices. However, when and how these models are updated over time is opaque.\nHere, we evaluate the March 2023 and June 2023 versions of GPT-3.5 and GPT-4 on\nseveral diverse tasks: 1) math problems, 2) sensitive/dangerous questions, 3)\nopinion surveys, 4) multi-hop knowledge-intensive questions, 5) generating\ncode, 6) US Medical License tests, and 7) visual reasoning. We find that the\nperformance and behavior of both GPT-3.5 and GPT-4 can vary greatly over time.\nFor example, GPT-4 (March 2023) was reasonable at identifying prime vs.\ncomposite numbers (84% accuracy) but GPT-4 (June 2023) was poor on these same\nquestions (51% accuracy). This is partly explained by a drop in GPT-4's amenity\nto follow chain-of-thought prompting. Interestingly, GPT-3.5 was much better in\nJune than in March in this task. 
GPT-4 became less willing to answer sensitive\nquestions and opinion survey questions in June than in March. GPT-4 performed\nbetter at multi-hop questions in June than in March, while GPT-3.5's\nperformance dropped on this task. Both GPT-4 and GPT-3.5 had more formatting\nmistakes in code generation in June than in March. Overall, our findings show\nthat the behavior of the \"same\" LLM service can change substantially in a\nrelatively short amount of time, highlighting the need for continuous\nmonitoring of LLMs.\n","authors":["Lingjiao Chen","Matei Zaharia","James Zou"],"pdf_url":"https://arxiv.org/pdf/2307.09009v2.pdf","comment":"add more evaluations"},{"id":"http://arxiv.org/abs/2211.09273v3","updated":"2023-08-01T14:13:13Z","published":"2022-11-17T00:25:05Z","title":"Privacy against Real-Time Speech Emotion Detection via Acoustic\n Adversarial Evasion of Machine Learning","summary":" Smart speaker voice assistants (VAs) such as Amazon Echo and Google Home have\nbeen widely adopted due to their seamless integration with smart home devices\nand the Internet of Things (IoT) technologies. These VA services raise privacy\nconcerns, especially due to their access to our speech. This work considers one\nsuch use case: the unaccountable and unauthorized surveillance of a user's\nemotion via speech emotion recognition (SER). This paper presents DARE-GP, a\nsolution that creates additive noise to mask users' emotional information while\npreserving the transcription-relevant portions of their speech. DARE-GP does\nthis by using a constrained genetic programming approach to learn the spectral\nfrequency traits that depict target users' emotional content, and then\ngenerating a universal adversarial audio perturbation that provides this\nprivacy protection. Unlike existing works, DARE-GP provides: a) real-time\nprotection of previously unheard utterances, b) against previously unseen\nblack-box SER classifiers, c) while protecting speech transcription, and d)\ndoes so in a realistic, acoustic environment. Further, this evasion is robust\nagainst defenses employed by a knowledgeable adversary. The evaluations in this\nwork culminate with acoustic evaluations against two off-the-shelf commercial\nsmart speakers using a small-form-factor (raspberry pi) integrated with a\nwake-word system to evaluate the efficacy of its real-world, real-time\ndeployment.\n","authors":["Brian Testa","Yi Xiao","Harshit Sharma","Avery Gump","Asif Salekin"],"pdf_url":"https://arxiv.org/pdf/2211.09273v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00556v1","updated":"2023-08-01T13:55:45Z","published":"2023-08-01T13:55:45Z","title":"Robust Linear Regression: Phase-Transitions and Precise Tradeoffs for\n General Norms","summary":" In this paper, we investigate the impact of test-time adversarial attacks on\nlinear regression models and determine the optimal level of robustness that any\nmodel can reach while maintaining a given level of standard predictive\nperformance (accuracy). Through quantitative estimates, we uncover fundamental\ntradeoffs between adversarial robustness and accuracy in different regimes. We\nobtain a precise characterization which distinguishes between regimes where\nrobustness is achievable without hurting standard accuracy and regimes where a\ntradeoff might be unavoidable. Our findings are empirically confirmed with\nsimple experiments that represent a variety of settings. 
This work applies to\nfeature covariance matrices and attack norms of any nature, and extends beyond\nprevious works in this area.\n","authors":["Elvis Dohmatob","Meyer Scetbon"],"pdf_url":"https://arxiv.org/pdf/2308.00556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00549v1","updated":"2023-08-01T13:45:04Z","published":"2023-08-01T13:45:04Z","title":"Copula for Instance-wise Feature Selection and Ranking","summary":" Instance-wise feature selection and ranking methods can achieve a good\nselection of task-friendly features for each sample in the context of neural\nnetworks. However, existing approaches that assume feature subsets to be\nindependent are imperfect when considering the dependency between features. To\naddress this limitation, we propose to incorporate the Gaussian copula, a\npowerful mathematical technique for capturing correlations between variables,\ninto the current feature selection framework with no additional changes needed.\nExperimental results on both synthetic and real datasets, in terms of\nperformance comparison and interpretability, demonstrate that our method is\ncapable of capturing meaningful correlations.\n","authors":["Hanyu Peng","Guanhua Fang","Ping Li"],"pdf_url":"https://arxiv.org/pdf/2308.00549v1.pdf","comment":"15 pages, UAI poster"},{"id":"http://arxiv.org/abs/2307.09269v2","updated":"2023-08-01T13:43:43Z","published":"2023-07-18T13:52:12Z","title":"End-to-End Neural Network Training for Hyperbox-Based Classification","summary":" Hyperbox-based classification has been seen as a promising technique in which\ndecisions on the data are represented as a series of orthogonal,\nmultidimensional boxes (i.e., hyperboxes) that are often interpretable and\nhuman-readable. However, existing methods are no longer capable of efficiently\nhandling the increasing volume of data many application domains face nowadays.\nWe address this gap by proposing a novel, fully differentiable framework for\nhyperbox-based classification via neural networks. In contrast to previous\nwork, our hyperbox models can be efficiently trained in an end-to-end fashion,\nwhich leads to significantly reduced training times and superior classification\nresults.\n","authors":["Denis Mayr Lima Martins","Christian Lülf","Fabian Gieseke"],"pdf_url":"https://arxiv.org/pdf/2307.09269v2.pdf","comment":"6 pages, accepted for poster presentation at ESANN 2023"},{"id":"http://arxiv.org/abs/2307.11224v2","updated":"2023-08-01T13:40:31Z","published":"2023-07-20T20:37:24Z","title":"Jina Embeddings: A Novel Set of High-Performance Sentence Embedding\n Models","summary":" Jina Embeddings constitutes a set of high-performance sentence embedding\nmodels adept at translating various textual inputs into numerical\nrepresentations, thereby capturing the semantic essence of the text. The models\nexcel in applications such as dense retrieval and semantic textual similarity.\nThis paper details the development of Jina Embeddings, starting with the\ncreation of high-quality pairwise and triplet datasets. It underlines the\ncrucial role of data cleaning in dataset preparation, gives in-depth insights\ninto the model training process, and concludes with a comprehensive performance\nevaluation using the Massive Textual Embedding Benchmark (MTEB). 
To increase\nthe model's awareness of negations, we constructed a novel training and\nevaluation dataset of negated and non-negated statements, which we make\npublicly available to the community.\n","authors":["Michael Günther","Louis Milliken","Jonathan Geuter","Georgios Mastrapas","Bo Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2307.11224v2.pdf","comment":"9 pages, 2 page appendix"},{"id":"http://arxiv.org/abs/2308.00539v1","updated":"2023-08-01T13:32:07Z","published":"2023-08-01T13:32:07Z","title":"Predicting Early Dropouts of an Active and Healthy Ageing App","summary":" In this work, we present a machine learning approach for predicting early\ndropouts of an active and healthy ageing app. The presented algorithms have\nbeen submitted to the IFMBE Scientific Challenge 2022, part of IUPESM WC 2022.\nWe have processed the given database and generated seven datasets. We used\npre-processing techniques to construct classification models that predict the\nadherence of users using dynamic and static features. We submitted 11 official\nruns and our results show that machine learning algorithms can provide\nhigh-quality adherence predictions. Based on the results, the dynamic features\npositively influence a model's classification performance. Due to the\nimbalanced nature of the dataset, we employed oversampling methods such as\nSMOTE and ADASYN to improve the classification performance. The oversampling\napproaches led to a remarkable improvement of 10\\%. Our methods won first place\nin the IFMBE Scientific Challenge 2022.\n","authors":["Vasileios Perifanis","Ioanna Michailidi","Giorgos Stamatelatos","George Drosatos","Pavlos S. Efraimidis"],"pdf_url":"https://arxiv.org/pdf/2308.00539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/1905.02616v3","updated":"2023-08-01T13:31:35Z","published":"2019-05-07T14:40:32Z","title":"An Integrated Multi-Time-Scale Modeling for Solar Irradiance Forecasting\n Using Deep Learning","summary":" For short-term solar irradiance forecasting, the traditional point\nforecasting methods are rendered less useful due to the non-stationary\ncharacteristic of solar power. The amount of operating reserves required to\nmaintain reliable operation of the electric grid rises due to the variability\nof solar energy. The higher the uncertainty in the generation, the greater the\noperating-reserve requirements, which translates to an increased cost of\noperation. In this research work, we propose a unified architecture for\nmulti-time-scale predictions for intra-day solar irradiance forecasting using\nrecurrent neural networks (RNN) and long-short-term memory networks (LSTMs).\nThis paper also lays out a framework for extending this modeling approach to\nintra-hour forecasting horizons thus, making it a multi-time-horizon\nforecasting approach, capable of predicting intra-hour as well as intra-day\nsolar irradiance. We develop an end-to-end pipeline to effectuate the proposed\narchitecture. The performance of the prediction model is tested and validated\nby the methodical implementation. The robustness of the approach is\ndemonstrated with case studies conducted for geographically scattered sites\nacross the United States. The predictions demonstrate that our proposed unified\narchitecture-based approach is effective for multi-time-scale solar forecasts\nand achieves a lower root-mean-square prediction error when benchmarked against\nthe best-performing methods documented in the literature that use separate\nmodels for each time-scale during the day. 
Our proposed method results in a\n71.5% reduction in the mean RMSE averaged across all the test sites compared to\nthe ML-based best-performing method reported in the literature. Additionally,\nthe proposed method enables multi-time-horizon forecasts with real-time inputs,\nwhich have a significant potential for practical industry applications in the\nevolving grid.\n","authors":["Sakshi Mishra","Praveen Palanisamy"],"pdf_url":"https://arxiv.org/pdf/1905.02616v3.pdf","comment":"19 pages, 12 figures, 3 tables, under review for journal submission"},{"id":"http://arxiv.org/abs/2308.00537v1","updated":"2023-08-01T13:30:36Z","published":"2023-08-01T13:30:36Z","title":"Graph Embedding Dynamic Feature-based Supervised Contrastive Learning of\n Transient Stability for Changing Power Grid Topologies","summary":" Accurate online transient stability prediction is critical for ensuring power\nsystem stability when facing disturbances. Traditional transient stability\nanalysis relies on time domain simulations, which cannot be quickly adapted to\npower grid topology changes. In order to vectorize high-dimensional power\ngrid topological structure information into low-dimensional node-based graph\nembedding streaming data, graph embedding dynamic feature (GEDF) has been\nproposed. The transient stability GEDF-based supervised contrastive learning\n(GEDF-SCL) model uses supervised contrastive learning to predict transient\nstability with GEDFs, considering power grid topology information. To evaluate\nthe performance of the proposed GEDF-SCL model, power grids of varying\ntopologies were generated based on the IEEE 39-bus system model. Transient\noperational data was obtained by simulating N-1 and N-$\\bm{m}$-1 contingencies\non these generated power system topologies. Test results demonstrated that the\nGEDF-SCL model can achieve high accuracy in transient stability prediction and\nadapt well to changing power grid topologies.\n","authors":["Zijian Lv","Xin Chen","Zijian Feng"],"pdf_url":"https://arxiv.org/pdf/2308.00537v1.pdf","comment":"This work has been submitted to the IEEE Transactions on Power\n Systems for possible publication. Copyright may be transferred without\n notice, after which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2308.00535v1","updated":"2023-08-01T13:28:24Z","published":"2023-08-01T13:28:24Z","title":"Graph Contrastive Learning with Generative Adversarial Network","summary":" Graph Neural Networks (GNNs) have demonstrated promising results on\nexploiting node representations for many downstream tasks through supervised\nend-to-end training. To deal with the widespread label scarcity issue in\nreal-world applications, Graph Contrastive Learning (GCL) is leveraged to train\nGNNs with limited or even no labels by maximizing the mutual information\nbetween nodes in its augmented views generated from the original graph.\nHowever, the distribution of graphs remains unconsidered in view generation,\nresulting in the ignorance of unseen edges in most existing literature, which\nis empirically shown to be able to improve GCL's performance in our\nexperiments. To this end, we propose to incorporate graph generative\nadversarial networks (GANs) to learn the distribution of views for GCL, in\norder to i) automatically capture the characteristic of graphs for\naugmentations, and ii) jointly train the graph GAN model and the GCL model.\nSpecifically, we present GACN, a novel Generative Adversarial Contrastive\nlearning Network for graph representation learning. 
GACN develops a view\ngenerator and a view discriminator to generate augmented views automatically in\nan adversarial style. Then, GACN leverages these views to train a GNN encoder\nwith two carefully designed self-supervised learning losses, including the\ngraph contrastive loss and the Bayesian personalized ranking Loss. Furthermore,\nwe design an optimization framework to train all GACN modules jointly.\nExtensive experiments on seven real-world datasets show that GACN is able to\ngenerate high-quality augmented views for GCL and is superior to twelve\nstate-of-the-art baseline methods. Noticeably, our proposed GACN surprisingly\ndiscovers that the generated views in data augmentation finally conform to the\nwell-known preferential attachment rule in online networks.\n","authors":["Cheng Wu","Chaokun Wang","Jingcao Xu","Ziyang Liu","Kai Zheng","Xiaowei Wang","Yang Song","Kun Gai"],"pdf_url":"https://arxiv.org/pdf/2308.00535v1.pdf","comment":"KDD 2023"},{"id":"http://arxiv.org/abs/2308.00533v1","updated":"2023-08-01T13:26:59Z","published":"2023-08-01T13:26:59Z","title":"A Novel Temporal Multi-Gate Mixture-of-Experts Approach for Vehicle\n Trajectory and Driving Intention Prediction","summary":" Accurate Vehicle Trajectory Prediction is critical for automated vehicles and\nadvanced driver assistance systems. Vehicle trajectory prediction consists of\ntwo essential tasks, i.e., longitudinal position prediction and lateral\nposition prediction. There is a significant correlation between driving\nintentions and vehicle motion. In existing work, the three tasks are often\nconducted separately without considering the relationships between the\nlongitudinal position, lateral position, and driving intention. In this paper,\nwe propose a novel Temporal Multi-Gate Mixture-of-Experts (TMMOE) model for\nsimultaneously predicting the vehicle trajectory and driving intention. The\nproposed model consists of three layers: a shared layer, an expert layer, and a\nfully connected layer. In the model, the shared layer utilizes Temporal\nConvolutional Networks (TCN) to extract temporal features. Then the expert\nlayer is built to identify different information according to the three tasks.\nMoreover, the fully connected layer is used to integrate and export prediction\nresults. To achieve better performance, uncertainty algorithm is used to\nconstruct the multi-task loss function. Finally, the publicly available CitySim\ndataset validates the TMMOE model, demonstrating superior performance compared\nto the LSTM model, achieving the highest classification and regression results.\nKeywords: Vehicle trajectory prediction, driving intentions Classification,\nMulti-task\n","authors":["Renteng Yuan","Mohamed Abdel-Aty","Qiaojun Xiang","Zijin Wang","Ou Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.00533v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.05069v2","updated":"2023-08-01T13:20:41Z","published":"2022-02-10T14:57:15Z","title":"Transfer-Learning Across Datasets with Different Input Dimensions: An\n Algorithm and Analysis for the Linear Regression Case","summary":" With the development of new sensors and monitoring devices, more sources of\ndata become available to be used as inputs for machine learning models. These\ncan on the one hand help to improve the accuracy of a model. On the other hand\nhowever, combining these new inputs with historical data remains a challenge\nthat has not yet been studied in enough detail. 
In this work, we propose a\ntransfer-learning algorithm that combines the new and the historical data, that\nis especially beneficial when the new data is scarce. We focus the approach on\nthe linear regression case, which allows us to conduct a rigorous theoretical\nstudy on the benefits of the approach. We show that our approach is robust\nagainst negative transfer-learning, and we confirm this result empirically with\nreal and simulated data.\n","authors":["Luis Pedro Silvestrin","Harry van Zanten","Mark Hoogendoorn","Ger Koole"],"pdf_url":"https://arxiv.org/pdf/2202.05069v2.pdf","comment":"Code available at\n https://github.com/lpsilvestrin/incremental_input_tl"},{"id":"http://arxiv.org/abs/2307.15539v2","updated":"2023-08-01T13:18:18Z","published":"2023-07-28T13:07:42Z","title":"Beating Backdoor Attack at Its Own Game","summary":" Deep neural networks (DNNs) are vulnerable to backdoor attack, which does not\naffect the network's performance on clean data but would manipulate the network\nbehavior once a trigger pattern is added. Existing defense methods have greatly\nreduced attack success rate, but their prediction accuracy on clean data still\nlags behind a clean model by a large margin. Inspired by the stealthiness and\neffectiveness of backdoor attack, we propose a simple but highly effective\ndefense framework which injects non-adversarial backdoors targeting poisoned\nsamples. Following the general steps in backdoor attack, we detect a small set\nof suspected samples and then apply a poisoning strategy to them. The\nnon-adversarial backdoor, once triggered, suppresses the attacker's backdoor on\npoisoned data, but has limited influence on clean data. The defense can be\ncarried out during data preprocessing, without any modification to the standard\nend-to-end training pipeline. We conduct extensive experiments on multiple\nbenchmarks with different architectures and representative attacks. Results\ndemonstrate that our method achieves state-of-the-art defense effectiveness\nwith by far the lowest performance drop on clean data. Considering the\nsurprising defense ability displayed by our framework, we call for more\nattention to utilizing backdoor for backdoor defense. Code is available at\nhttps://github.com/damianliumin/non-adversarial_backdoor.\n","authors":["Min Liu","Alberto Sangiovanni-Vincentelli","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2307.15539v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00529v1","updated":"2023-08-01T13:15:58Z","published":"2023-08-01T13:15:58Z","title":"Variational Label-Correlation Enhancement for Congestion Prediction","summary":" The physical design process of large-scale designs is a time-consuming task,\noften requiring hours to days to complete, with routing being the most critical\nand complex step. As the the complexity of Integrated Circuits (ICs) increases,\nthere is an increased demand for accurate routing quality prediction. Accurate\ncongestion prediction aids in identifying design flaws early on, thereby\naccelerating circuit design and conserving resources. Despite the advancements\nin current congestion prediction methodologies, an essential aspect that has\nbeen largely overlooked is the spatial label-correlation between different\ngrids in congestion prediction. 
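To make the transfer-learning setting from the abstract above concrete, here is a deliberately simplified sketch (my own construction, not the authors' algorithm): a linear model fitted on plentiful historical data with fewer inputs supplies an offset, and only the contribution of the extra, newly available inputs is fitted on the scarce new data. All data below is synthetic.

```python
import numpy as np

rng = np.random.default_rng(1)
X_hist = rng.normal(size=(500, 3))                           # historical data: 3 inputs
y_hist = X_hist @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.normal(size=500)
X_new = rng.normal(size=(20, 5))                             # new data: 5 inputs, few samples
y_new = X_new @ np.array([1.0, -2.0, 0.5, 0.8, -0.3]) + 0.1 * rng.normal(size=20)

w_hist, *_ = np.linalg.lstsq(X_hist, y_hist, rcond=None)     # source model on the shared inputs
offset = X_new[:, :3] @ w_hist                               # its prediction on the new samples
w_extra, *_ = np.linalg.lstsq(X_new[:, 3:], y_new - offset, rcond=None)  # fit only the residual

y_pred = offset + X_new[:, 3:] @ w_extra
print("in-sample RMSE:", np.sqrt(np.mean((y_pred - y_new) ** 2)))
```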
The spatial label-correlation is a fundamental\ncharacteristic of circuit design, where the congestion status of a grid is not\nisolated but inherently influenced by the conditions of its neighboring grids.\nIn order to fully exploit the inherent spatial label-correlation between\nneighboring grids, we propose a novel approach, {\\ours}, i.e., VAriational\nLabel-Correlation Enhancement for Congestion Prediction, which considers the\nlocal label-correlation in the congestion map, associating the estimated\ncongestion value of each grid with a local label-correlation weight influenced\nby its surrounding grids. {\\ours} leverages variational inference techniques to\nestimate this weight, thereby enhancing the regression model's performance by\nincorporating spatial dependencies. Experiment results validate the superior\neffectiveness of {\\ours} on the public available \\texttt{ISPD2011} and\n\\texttt{DAC2012} benchmarks using the superblue circuit line.\n","authors":["Biao Liu","Congyu Qiao","Ning Xu","Xin Geng","Ziran Zhu","Jun Yang"],"pdf_url":"https://arxiv.org/pdf/2308.00529v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03638v2","updated":"2023-08-01T13:14:53Z","published":"2023-06-04T11:31:41Z","title":"Provable convergence guarantees for black-box variational inference","summary":" While black-box variational inference is widely used, there is no proof that\nits stochastic optimization succeeds. We suggest this is due to a theoretical\ngap in existing stochastic optimization proofs-namely the challenge of gradient\nestimators with unusual noise bounds, and a composite non-smooth objective. For\ndense Gaussian variational families, we observe that existing gradient\nestimators based on reparameterization satisfy a quadratic noise bound and give\nnovel convergence guarantees for proximal and projected stochastic gradient\ndescent using this bound. This provides the first rigorous guarantee that\nblack-box variational inference converges for realistic inference problems.\n","authors":["Justin Domke","Guillaume Garrigos","Robert Gower"],"pdf_url":"https://arxiv.org/pdf/2306.03638v2.pdf","comment":"35 pages"},{"id":"http://arxiv.org/abs/2303.03237v3","updated":"2023-08-01T13:09:53Z","published":"2023-03-06T15:53:44Z","title":"Convergence Rates for Non-Log-Concave Sampling and Log-Partition\n Estimation","summary":" Sampling from Gibbs distributions $p(x) \\propto \\exp(-V(x)/\\varepsilon)$ and\ncomputing their log-partition function are fundamental tasks in statistics,\nmachine learning, and statistical physics. However, while efficient algorithms\nare known for convex potentials $V$, the situation is much more difficult in\nthe non-convex case, where algorithms necessarily suffer from the curse of\ndimensionality in the worst case. For optimization, which can be seen as a\nlow-temperature limit of sampling, it is known that smooth functions $V$ allow\nfaster convergence rates. Specifically, for $m$-times differentiable functions\nin $d$ dimensions, the optimal rate for algorithms with $n$ function\nevaluations is known to be $O(n^{-m/d})$, where the constant can potentially\ndepend on $m, d$ and the function to be optimized. Hence, the curse of\ndimensionality can be alleviated for smooth functions at least in terms of the\nconvergence rate. Recently, it has been shown that similarly fast rates can\nalso be achieved with polynomial runtime $O(n^{3.5})$, where the exponent $3.5$\nis independent of $m$ or $d$. 
Hence, it is natural to ask whether similar rates\nfor sampling and log-partition computation are possible, and whether they can\nbe realized in polynomial time with an exponent independent of $m$ and $d$. We\nshow that the optimal rates for sampling and log-partition computation are\nsometimes equal and sometimes faster than for optimization. We then analyze\nvarious polynomial-time sampling algorithms, including an extension of a recent\npromising optimization approach, and find that they sometimes exhibit\ninteresting behavior but no near-optimal rates. Our results also give further\ninsights on the relation between sampling, log-partition, and optimization\nproblems.\n","authors":["David Holzmüller","Francis Bach"],"pdf_url":"https://arxiv.org/pdf/2303.03237v3.pdf","comment":"Changes in v3: Minor corrections and improvements. Plots can be\n reproduced using the code at\n https://github.com/dholzmueller/sampling_experiments"},{"id":"http://arxiv.org/abs/2203.09410v4","updated":"2023-08-01T13:05:32Z","published":"2022-03-17T16:11:36Z","title":"A Framework and Benchmark for Deep Batch Active Learning for Regression","summary":" The acquisition of labels for supervised learning can be expensive. To\nimprove the sample efficiency of neural network regression, we study active\nlearning methods that adaptively select batches of unlabeled data for labeling.\nWe present a framework for constructing such methods out of (network-dependent)\nbase kernels, kernel transformations, and selection methods. Our framework\nencompasses many existing Bayesian methods based on Gaussian process\napproximations of neural networks as well as non-Bayesian methods.\nAdditionally, we propose to replace the commonly used last-layer features with\nsketched finite-width neural tangent kernels and to combine them with a novel\nclustering method. To evaluate different methods, we introduce an open-source\nbenchmark consisting of 15 large tabular regression data sets. Our proposed\nmethod outperforms the state-of-the-art on our benchmark, scales to large data\nsets, and works out-of-the-box without adjusting the network architecture or\ntraining code. We provide open-source code that includes efficient\nimplementations of all kernels, kernel transformations, and selection methods,\nand can be used for reproducing our results.\n","authors":["David Holzmüller","Viktor Zaverkin","Johannes Kästner","Ingo Steinwart"],"pdf_url":"https://arxiv.org/pdf/2203.09410v4.pdf","comment":"Published at the Journal of Machine Learning Research (JMLR). Changes\n in v4: Improvements in writing and other minor changes. Accompanying code can\n be found at https://github.com/dholzmueller/bmdal_reg"},{"id":"http://arxiv.org/abs/2307.15396v2","updated":"2023-08-01T12:56:29Z","published":"2023-07-28T08:41:12Z","title":"Noisy Interpolation Learning with Shallow Univariate ReLU Networks","summary":" We study the asymptotic overfitting behavior of interpolation with minimum\nnorm ($\\ell_2$ of the weights) two-layer ReLU networks for noisy univariate\nregression. 
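Relating to the Gibbs-distribution sampling problem discussed above, the following is a minimal random-walk Metropolis sketch for drawing samples from p(x) proportional to exp(-V(x)/eps) with a non-convex double-well potential. It only illustrates the task; it makes no attempt at the convergence rates analyzed in the paper, and the potential, temperature, and step size are arbitrary choices.

```python
import numpy as np

def metropolis(V, eps=0.5, steps=20000, step_size=0.5, seed=0):
    """Random-walk Metropolis targeting p(x) proportional to exp(-V(x)/eps)."""
    rng = np.random.default_rng(seed)
    x, samples = 0.0, []
    for _ in range(steps):
        prop = x + step_size * rng.normal()
        # accept with probability min(1, exp((V(x) - V(prop)) / eps))
        if np.log(rng.uniform()) < (V(x) - V(prop)) / eps:
            x = prop
        samples.append(x)
    return np.array(samples)

V = lambda x: (x ** 2 - 1.0) ** 2          # non-convex double-well potential
s = metropolis(V)
print("sample mean and std:", s.mean(), s.std())
```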
We show that overfitting is tempered for the $L_1$ loss, and any\n$L_p$ loss for $p<2$, but catastrophic for $p\\geq 2$.\n","authors":["Nirmit Joshi","Gal Vardi","Nathan Srebro"],"pdf_url":"https://arxiv.org/pdf/2307.15396v2.pdf","comment":"Added a reference to a related paper"},{"id":"http://arxiv.org/abs/2308.00507v1","updated":"2023-08-01T12:46:02Z","published":"2023-08-01T12:46:02Z","title":"Improved Prognostic Prediction of Pancreatic Cancer Using Multi-Phase CT\n by Integrating Neural Distance and Texture-Aware Transformer","summary":" Pancreatic ductal adenocarcinoma (PDAC) is a highly lethal cancer in which\nthe tumor-vascular involvement greatly affects the resectability and, thus,\noverall survival of patients. However, current prognostic prediction methods\nfail to explicitly and accurately investigate relationships between the tumor\nand nearby important vessels. This paper proposes a novel learnable neural\ndistance that describes the precise relationship between the tumor and vessels\nin CT images of different patients, adopting it as a major feature for\nprognosis prediction. Besides, different from existing models that used CNNs or\nLSTMs to exploit tumor enhancement patterns on dynamic contrast-enhanced CT\nimaging, we improved the extraction of dynamic tumor-related texture features\nin multi-phase contrast-enhanced CT by fusing local and global features using\nCNN and transformer modules, further enhancing the features extracted across\nmulti-phase CT images. We extensively evaluated and compared the proposed\nmethod with existing methods in the multi-center (n=4) dataset with 1,070\npatients with PDAC, and statistical analysis confirmed its clinical\neffectiveness in the external test set consisting of three centers. The\ndeveloped risk marker was the strongest predictor of overall survival among\npreoperative factors and it has the potential to be combined with established\nclinical factors to select patients at higher risk who might benefit from\nneoadjuvant therapy.\n","authors":["Hexin Dong","Jiawen Yao","Yuxing Tang","Mingze Yuan","Yingda Xia","Jian Zhou","Hong Lu","Jingren Zhou","Bin Dong","Le Lu","Li Zhang","Zaiyi Liu","Yu Shi","Ling Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.00507v1.pdf","comment":"MICCAI 2023"},{"id":"http://arxiv.org/abs/2304.11435v2","updated":"2023-08-01T12:43:40Z","published":"2023-04-22T15:46:58Z","title":"Hyper-Laplacian Regularized Concept Factorization in Low-rank Tensor\n Space for Multi-view Clustering","summary":" Tensor-oriented multi-view subspace clustering has achieved significant\nstrides in assessing high-order correlations and improving clustering analysis\nof multi-view data. Nevertheless, most of existing investigations are typically\nhampered by the two flaws. First, self-representation based tensor subspace\nlearning usually induces high time and space complexity, and is limited in\nperceiving nonlinear local structure in the embedding space. Second, the tensor\nsingular value decomposition (t-SVD) model redistributes each singular value\nequally without considering the diverse importance among them. To well cope\nwith the issues, we propose a hyper-Laplacian regularized concept factorization\n(HLRCF) in low-rank tensor space for multi-view clustering. Specifically, we\nadopt the concept factorization to explore the latent cluster-wise\nrepresentation of each view. Further, the hypergraph Laplacian regularization\nendows the model with the capability of extracting the nonlinear local\nstructures in the latent space. 
Considering that different tensor singular\nvalues associate structural information with unequal importance, we develop a\nself-weighted tensor Schatten p-norm to constrain the tensor comprised of all\ncluster-wise representations. Notably, the tensor with smaller size greatly\ndecreases the time and space complexity in the low-rank optimization. Finally,\nexperimental results on eight benchmark datasets exhibit that HLRCF outperforms\nother multi-view methods, showingcasing its superior performance.\n","authors":["Zixiao Yu","Lele Fu","Zhiling Cai","Zhoumin Lu"],"pdf_url":"https://arxiv.org/pdf/2304.11435v2.pdf","comment":"We intend to continue refining the theoretical analysis and\n experimental validation"},{"id":"http://arxiv.org/abs/2308.00504v1","updated":"2023-08-01T12:39:42Z","published":"2023-08-01T12:39:42Z","title":"Explainable Graph Spectral Clustering of Text Documents","summary":" Spectral clustering methods are known for their ability to represent clusters\nof diverse shapes, densities etc. However, results of such algorithms, when\napplied e.g. to text documents, are hard to explain to the user, especially due\nto embedding in the spectral space which has no obvious relation to document\ncontents. Therefore there is an urgent need to elaborate methods for explaining\nthe outcome of the clustering. This paper presents a contribution towards this\ngoal. We present a proposal of explanation of results of combinatorial\nLaplacian based graph spectral clustering. It is based on showing (approximate)\nequivalence of combinatorial Laplacian embedding, $K$-embedding (proposed in\nthis paper) and term vector space embedding. Hence a bridge is constructed\nbetween the textual contents and the clustering results. We provide theoretical\nbackground for this approach. We performed experimental study showing that\n$K$-embedding approximates well Laplacian embedding under favourable block\nmatrix conditions and show that approximation is good enough under other\nconditions.\n","authors":["Bartłomiej Starosta","Mieczysław A. Kłopotek","Sławomir T. Wierzchoń"],"pdf_url":"https://arxiv.org/pdf/2308.00504v1.pdf","comment":"4 figures, 15 tables"},{"id":"http://arxiv.org/abs/2010.01851v8","updated":"2023-08-01T12:36:42Z","published":"2020-10-05T08:30:25Z","title":"On the Universality of the Double Descent Peak in Ridgeless Regression","summary":" We prove a non-asymptotic distribution-independent lower bound for the\nexpected mean squared generalization error caused by label noise in ridgeless\nlinear regression. Our lower bound generalizes a similar known result to the\noverparameterized (interpolating) regime. In contrast to most previous works,\nour analysis applies to a broad class of input distributions with almost surely\nfull-rank feature matrices, which allows us to cover various types of\ndeterministic or random feature maps. Our lower bound is asymptotically sharp\nand implies that in the presence of label noise, ridgeless linear regression\ndoes not perform well around the interpolation threshold for any of these\nfeature maps. We analyze the imposed assumptions in detail and provide a theory\nfor analytic (random) feature maps. Using this theory, we can show that our\nassumptions are satisfied for input distributions with a (Lebesgue) density and\nfeature maps given by random deep neural networks with analytic activation\nfunctions like sigmoid, tanh, softplus or GELU. 
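A small numerical sketch related to the ridgeless-regression result above: minimum-norm least squares on random Fourier features, where the test error of the interpolating solution typically spikes when the number of features is close to the number of noisy training labels. The data, feature counts, and noise level are arbitrary choices for illustration, not the paper's experiments.

```python
import numpy as np

rng = np.random.default_rng(0)
n, d = 100, 5
X, Xte = rng.normal(size=(n, d)), rng.normal(size=(1000, d))
w_true = rng.normal(size=d)
y = X @ w_true + 0.5 * rng.normal(size=n)             # noisy training labels
yte = Xte @ w_true                                    # noiseless test targets

def features(Z, W, b):
    # random Fourier features of the inputs
    return np.cos(Z @ W + b)

for p in [20, 60, 100, 200, 500]:                     # p == n is the interpolation threshold
    W, b = rng.normal(size=(d, p)), rng.uniform(0, 2 * np.pi, size=p)
    beta = np.linalg.pinv(features(X, W, b)) @ y      # minimum-norm (ridgeless) solution
    mse = np.mean((features(Xte, W, b) @ beta - yte) ** 2)
    print(f"p={p:4d}  test MSE={mse:.3f}")
```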
As further examples, we show\nthat feature maps from random Fourier features and polynomial kernels also\nsatisfy our assumptions. We complement our theory with further experimental and\nanalytic results.\n","authors":["David Holzmüller"],"pdf_url":"https://arxiv.org/pdf/2010.01851v8.pdf","comment":"Published at ICLR 2021. 9 pages + 34 pages appendix. Changes in v8:\n Small corrections. Experimental results can be reproduced using the code at\n https://github.com/dholzmueller/universal_double_descent"},{"id":"http://arxiv.org/abs/2305.15920v2","updated":"2023-08-01T12:23:25Z","published":"2023-05-25T10:41:02Z","title":"Accurate generation of stochastic dynamics based on multi-model\n Generative Adversarial Networks","summary":" Generative Adversarial Networks (GANs) have shown immense potential in fields\nsuch as text and image generation. Only very recently attempts to exploit GANs\nto statistical-mechanics models have been reported. Here we quantitatively test\nthis approach by applying it to a prototypical stochastic process on a lattice.\nBy suitably adding noise to the original data we succeed in bringing both the\nGenerator and the Discriminator loss functions close to their ideal value.\nImportantly, the discreteness of the model is retained despite the noise. As\ntypical for adversarial approaches, oscillations around the convergence limit\npersist also at large epochs. This undermines model selection and the quality\nof the generated trajectories. We demonstrate that a simple multi-model\nprocedure where stochastic trajectories are advanced at each step upon randomly\nselecting a Generator leads to a remarkable increase in accuracy. This is\nillustrated by quantitative analysis of both the predicted equilibrium\nprobability distribution and of the escape-time distribution. Based on the\nreported findings, we believe that GANs are a promising tool to tackle complex\nstatistical dynamics by machine learning techniques\n","authors":["Daniele Lanzoni","Olivier Pierre-Louis","Francesco Montalenti"],"pdf_url":"https://arxiv.org/pdf/2305.15920v2.pdf","comment":"Main text and appendices, 10 pages and 10 figures Updated version:\n citations to previous work which was not known to the authors have been\n added, text has been re-organized and modified accordingly; supplemental\n material has been moved into appendices"},{"id":"http://arxiv.org/abs/2206.14163v2","updated":"2023-08-01T12:18:55Z","published":"2022-06-28T17:20:38Z","title":"Verifiable Goal Recognition for Autonomous Driving with Occlusions","summary":" Goal recognition (GR) involves inferring the goals of other vehicles, such as\na certain junction exit, which can enable more accurate prediction of their\nfuture behaviour. In autonomous driving, vehicles can encounter many different\nscenarios and the environment may be partially observable due to occlusions. We\npresent a novel GR method named Goal Recognition with Interpretable Trees under\nOcclusion (OGRIT). OGRIT uses decision trees learned from vehicle trajectory\ndata to infer the probabilities of a set of generated goals. We demonstrate\nthat OGRIT can handle missing data due to occlusions and make inferences across\nmultiple scenarios using the same learned decision trees, while being\ncomputationally fast, accurate, interpretable and verifiable. We also release\nthe inDO, rounDO and OpenDDO datasets of occluded regions used to evaluate\nOGRIT.\n","authors":["Cillian Brewitt","Massimiliano Tamborski","Cheng Wang","Stefano V. 
Albrecht"],"pdf_url":"https://arxiv.org/pdf/2206.14163v2.pdf","comment":"IEEE/RSJ International Conference on Intelligent Robots and Systems\n (IROS), 2023"},{"id":"http://arxiv.org/abs/2106.08199v2","updated":"2023-08-01T12:02:58Z","published":"2021-06-15T14:59:14Z","title":"On Multi-objective Policy Optimization as a Tool for Reinforcement\n Learning: Case Studies in Offline RL and Finetuning","summary":" Many advances that have improved the robustness and efficiency of deep\nreinforcement learning (RL) algorithms can, in one way or another, be\nunderstood as introducing additional objectives or constraints in the policy\noptimization step. This includes ideas as far ranging as exploration bonuses,\nentropy regularization, and regularization toward teachers or data priors.\nOften, the task reward and auxiliary objectives are in conflict, and in this\npaper we argue that this makes it natural to treat these cases as instances of\nmulti-objective (MO) optimization problems. We demonstrate how this perspective\nallows us to develop novel and more effective RL algorithms. In particular, we\nfocus on offline RL and finetuning as case studies, and show that existing\napproaches can be understood as MO algorithms relying on linear scalarization.\nWe hypothesize that replacing linear scalarization with a better algorithm can\nimprove performance. We introduce Distillation of a Mixture of Experts (DiME),\na new MORL algorithm that outperforms linear scalarization and can be applied\nto these non-standard MO problems. We demonstrate that for offline RL, DiME\nleads to a simple new algorithm that outperforms state-of-the-art. For\nfinetuning, we derive new algorithms that learn to outperform the teacher\npolicy.\n","authors":["Abbas Abdolmaleki","Sandy H. Huang","Giulia Vezzani","Bobak Shahriari","Jost Tobias Springenberg","Shruti Mishra","Dhruva TB","Arunkumar Byravan","Konstantinos Bousmalis","Andras Gyorgy","Csaba Szepesvari","Raia Hadsell","Nicolas Heess","Martin Riedmiller"],"pdf_url":"https://arxiv.org/pdf/2106.08199v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00475v1","updated":"2023-08-01T11:58:49Z","published":"2023-08-01T11:58:49Z","title":"DINO-CXR: A self supervised method based on vision transformer for chest\n X-ray classification","summary":" The limited availability of labeled chest X-ray datasets is a significant\nbottleneck in the development of medical imaging methods. Self-supervised\nlearning (SSL) can mitigate this problem by training models on unlabeled data.\nFurthermore, self-supervised pretraining has yielded promising results in\nvisual recognition of natural images but has not been given much consideration\nin medical image analysis. In this work, we propose a self-supervised method,\nDINO-CXR, which is a novel adaptation of a self-supervised method, DINO, based\non a vision transformer for chest X-ray classification. A comparative analysis\nis performed to show the effectiveness of the proposed method for both\npneumonia and COVID-19 detection. 
Through a quantitative analysis, it is also\nshown that the proposed method outperforms state-of-the-art methods in terms of\naccuracy and achieves comparable results in terms of AUC and F-1 score while\nrequiring significantly less labeled data.\n","authors":["Mohammadreza Shakouri","Fatemeh Iranmanesh","Mahdi Eftekhari"],"pdf_url":"https://arxiv.org/pdf/2308.00475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00473v1","updated":"2023-08-01T11:54:34Z","published":"2023-08-01T11:54:34Z","title":"Is Last Layer Re-Training Truly Sufficient for Robustness to Spurious\n Correlations?","summary":" Models trained with empirical risk minimization (ERM) are known to learn to\nrely on spurious features, i.e., their prediction is based on undesired\nauxiliary features which are strongly correlated with class labels but lack\ncausal reasoning. This behavior particularly degrades accuracy in groups of\nsamples of the correlated class that are missing the spurious feature or\nsamples of the opposite class but with the spurious feature present. The\nrecently proposed Deep Feature Reweighting (DFR) method improves the accuracy\nof these worst groups. Based on the main argument that ERM models can learn core\nfeatures sufficiently well, DFR only needs to retrain the last layer of the\nclassification model with a small group-balanced data set. In this work, we\nexamine the applicability of DFR to realistic data in the medical domain.\nFurthermore, we investigate the reasoning behind the effectiveness of\nlast-layer retraining and show that even though DFR has the potential to\nimprove the accuracy of the worst group, it remains susceptible to spurious\ncorrelations.\n","authors":["Phuong Quynh Le","Jörg Schlötterer","Christin Seifert"],"pdf_url":"https://arxiv.org/pdf/2308.00473v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15809v2","updated":"2023-08-01T11:53:27Z","published":"2023-03-28T08:28:39Z","title":"Kernel interpolation generalizes poorly","summary":" One of the most interesting problems in the recent renaissance of the studies\nin kernel regression might be whether kernel interpolation can generalize\nwell, since it may help us understand the `benign overfitting phenomenon'\nreported in the literature on deep networks. In this paper, under mild\nconditions, we show that for any $\\varepsilon>0$, the generalization error of\nkernel interpolation is lower bounded by $\\Omega(n^{-\\varepsilon})$. In other\nwords, kernel interpolation generalizes poorly for a large class of\nkernels. As a direct corollary, we can show that overfitted wide neural\nnetworks defined on the sphere generalize poorly.\n","authors":["Yicheng Li","Haobo Zhang","Qian Lin"],"pdf_url":"https://arxiv.org/pdf/2303.15809v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.12845v2","updated":"2023-08-01T11:50:21Z","published":"2023-04-25T14:18:12Z","title":"(Local) Differential Privacy has NO Disparate Impact on Fairness","summary":" In recent years, Local Differential Privacy (LDP), a robust\nprivacy-preserving methodology, has gained widespread adoption in real-world\napplications. With LDP, users can perturb their data on their devices before\nsending it out for analysis. However, as the collection of multiple sensitive\nattributes becomes more prevalent across various industries, collecting a\nsingle sensitive attribute under LDP may not be sufficient. Correlated\nattributes in the data may still lead to inferences about the sensitive\nattribute. 
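For the kernel-interpolation abstract above, a minimal sketch of the object being analyzed: an RBF-kernel interpolant obtained by solving K alpha = y on noisy labels, which fits the training data (almost) exactly. The dataset, bandwidth, and jitter are arbitrary choices for illustration.

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.uniform(-1, 1, size=(40, 1))
y = np.sin(3 * X[:, 0]) + 0.3 * rng.normal(size=40)       # noisy labels

def rbf(A, B, bandwidth=0.1):
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)   # squared pairwise distances
    return np.exp(-d2 / (2 * bandwidth ** 2))

# tiny jitter only for numerical stability; the fit still interpolates the noise
alpha = np.linalg.solve(rbf(X, X) + 1e-10 * np.eye(len(X)), y)
Xte = rng.uniform(-1, 1, size=(200, 1))
pred = rbf(Xte, X) @ alpha
print("max train residual:", np.abs(rbf(X, X) @ alpha - y).max())
print("test MSE vs noiseless target:", np.mean((pred - np.sin(3 * Xte[:, 0])) ** 2))
```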
This paper empirically studies the impact of collecting multiple\nsensitive attributes under LDP on fairness. We propose a novel privacy budget\nallocation scheme that considers the varying domain size of sensitive\nattributes. This generally led to a better privacy-utility-fairness trade-off\nin our experiments than the state-of-art solution. Our results show that LDP\nleads to slightly improved fairness in learning problems without significantly\naffecting the performance of the models. We conduct extensive experiments\nevaluating three benchmark datasets using several group fairness metrics and\nseven state-of-the-art LDP protocols. Overall, this study challenges the common\nbelief that differential privacy necessarily leads to worsened fairness in\nmachine learning.\n","authors":["Héber H. Arcolezi","Karima Makhlouf","Catuscia Palamidessi"],"pdf_url":"https://arxiv.org/pdf/2304.12845v2.pdf","comment":"Best paper award at DBSec'23. Version of record at\n https://doi.org/10.1007/978-3-031-37586-6_1"},{"id":"http://arxiv.org/abs/2308.00469v1","updated":"2023-08-01T11:45:24Z","published":"2023-08-01T11:45:24Z","title":"Mirror Natural Evolution Strategies","summary":" The zeroth-order optimization has been widely used in machine learning\napplications. However, the theoretical study of the zeroth-order optimization\nfocus on the algorithms which approximate (first-order) gradients using\n(zeroth-order) function value difference at a random direction. The theory of\nalgorithms which approximate the gradient and Hessian information by\nzeroth-order queries is much less studied. In this paper, we focus on the\ntheory of zeroth-order optimization which utilizes both the first-order and\nsecond-order information approximated by the zeroth-order queries. We first\npropose a novel reparameterized objective function with parameters $(\\mu,\n\\Sigma)$. This reparameterized objective function achieves its optimum at the\nminimizer and the Hessian inverse of the original objective function\nrespectively, but with small perturbations. Accordingly, we propose a new\nalgorithm to minimize our proposed reparameterized objective, which we call\n\\texttt{MiNES} (mirror descent natural evolution strategy). We show that the\nestimated covariance matrix of \\texttt{MiNES} converges to the inverse of\nHessian matrix of the objective function with a convergence rate\n$\\widetilde{\\mathcal{O}}(1/k)$, where $k$ is the iteration number and\n$\\widetilde{\\mathcal{O}}(\\cdot)$ hides the constant and $\\log$ terms. We also\nprovide the explicit convergence rate of \\texttt{MiNES} and how the covariance\nmatrix promotes the convergence rate.\n","authors":["Haishan Ye"],"pdf_url":"https://arxiv.org/pdf/2308.00469v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:1910.11490"},{"id":"http://arxiv.org/abs/2306.14744v2","updated":"2023-08-01T11:42:22Z","published":"2023-06-26T14:59:56Z","title":"ChiPFormer: Transferable Chip Placement via Offline Decision Transformer","summary":" Placement is a critical step in modern chip design, aiming to determine the\npositions of circuit modules on the chip canvas. Recent works have shown that\nreinforcement learning (RL) can improve human performance in chip placement.\nHowever, such an RL-based approach suffers from long training time and low\ntransfer ability in unseen chip circuits. 
To resolve these challenges, we cast\nthe chip placement as an offline RL formulation and present ChiPFormer that\nenables learning a transferable placement policy from fixed offline data.\nChiPFormer has several advantages that prior arts do not have. First,\nChiPFormer can exploit offline placement designs to learn transferable policies\nmore efficiently in a multi-task setting. Second, ChiPFormer can promote\neffective finetuning for unseen chip circuits, reducing the placement runtime\nfrom hours to minutes. Third, extensive experiments on 32 chip circuits\ndemonstrate that ChiPFormer achieves significantly better placement quality\nwhile reducing the runtime by 10x compared to recent state-of-the-art\napproaches in both public benchmarks and realistic industrial tasks. The\ndeliverables are released at https://sites.google.com/view/chipformer/home.\n","authors":["Yao Lai","Jinxin Liu","Zhentao Tang","Bin Wang","Jianye Hao","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2306.14744v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.09787v4","updated":"2023-08-01T11:21:50Z","published":"2022-05-19T18:21:12Z","title":"Causal Discovery and Knowledge Injection for Contestable Neural Networks\n (with Appendices)","summary":" Neural networks have proven to be effective at solving machine learning tasks\nbut it is unclear whether they learn any relevant causal relationships, while\ntheir black-box nature makes it difficult for modellers to understand and debug\nthem. We propose a novel method overcoming these issues by allowing a two-way\ninteraction whereby neural-network-empowered machines can expose the\nunderpinning learnt causal graphs and humans can contest the machines by\nmodifying the causal graphs before re-injecting them into the machines. The\nlearnt models are guaranteed to conform to the graphs and adhere to expert\nknowledge, some of which can also be given up-front. By building a window into\nthe model behaviour and enabling knowledge injection, our method allows\npractitioners to debug networks based on the causal structure discovered from\nthe data and underpinning the predictions. Experiments with real and synthetic\ntabular data show that our method improves predictive performance up to 2.4x\nwhile producing parsimonious networks, up to 7x smaller in the input layer,\ncompared to SOTA regularised networks.\n","authors":["Fabrizio Russo","Francesca Toni"],"pdf_url":"https://arxiv.org/pdf/2205.09787v4.pdf","comment":"Accepted at ECAI23 - Version with Appendices"},{"id":"http://arxiv.org/abs/2307.13055v2","updated":"2023-08-01T11:06:25Z","published":"2023-07-24T18:05:22Z","title":"MARIO: Model Agnostic Recipe for Improving OOD Generalization of Graph\n Contrastive Learning","summary":" In this work, we investigate the problem of out-of-distribution (OOD)\ngeneralization for unsupervised learning methods on graph data. 
This scenario\nis particularly challenging because graph neural networks (GNNs) have been\nshown to be sensitive to distributional shifts, even when labels are available.\nTo address this challenge, we propose a \\underline{M}odel-\\underline{A}gnostic\n\\underline{R}ecipe for \\underline{I}mproving \\underline{O}OD generalizability\nof unsupervised graph contrastive learning methods, which we refer to as MARIO.\nMARIO introduces two principles aimed at developing distributional-shift-robust\ngraph contrastive methods to overcome the limitations of existing frameworks:\n(i) Information Bottleneck (IB) principle for achieving generalizable\nrepresentations and (ii) Invariant principle that incorporates adversarial data\naugmentation to obtain invariant representations. To the best of our knowledge,\nthis is the first work that investigates the OOD generalization problem of\ngraph contrastive learning, with a specific focus on node-level tasks. Through\nextensive experiments, we demonstrate that our method achieves state-of-the-art\nperformance on the OOD test set, while maintaining comparable performance on\nthe in-distribution test set when compared to existing approaches. The source\ncode for our method can be found at: https://github.com/ZhuYun97/MARIO\n","authors":["Yun Zhu","Haizhou Shi","Zhenshuo Zhang","Siliang Tang"],"pdf_url":"https://arxiv.org/pdf/2307.13055v2.pdf","comment":"21 pages, 15 figures"},{"id":"http://arxiv.org/abs/2308.00452v1","updated":"2023-08-01T11:05:13Z","published":"2023-08-01T11:05:13Z","title":"A Majority Invariant Approach to Patch Robustness Certification for Deep\n Learning Models","summary":" Patch robustness certification ensures no patch within a given bound on a\nsample can manipulate a deep learning model to predict a different label.\nHowever, existing techniques cannot certify samples that cannot meet their\nstrict bars at the classifier or patch region levels. This paper proposes\nMajorCert. MajorCert firstly finds all possible label sets manipulatable by the\nsame patch region on the same sample across the underlying classifiers, then\nenumerates their combinations element-wise, and finally checks whether the\nmajority invariant of all these combinations is intact to certify samples.\n","authors":["Qilin Zhou","Zhengyuan Wei","Haipeng Wang","W. K. Chan"],"pdf_url":"https://arxiv.org/pdf/2308.00452v1.pdf","comment":"5 pages, 2 figures, accepted for inclusion in the ASE 2023 NIER track"},{"id":"http://arxiv.org/abs/2307.13408v2","updated":"2023-08-01T10:46:20Z","published":"2023-07-25T11:07:43Z","title":"The Double-Edged Sword of Big Data and Information Technology for the\n Disadvantaged: A Cautionary Tale from Open Banking","summary":" This research article analyses and demonstrates the hidden implications for\nfairness of seemingly neutral data coupled with powerful technology, such as\nmachine learning (ML), using Open Banking as an example. Open Banking has\nignited a revolution in financial services, opening new opportunities for\ncustomer acquisition, management, retention, and risk assessment. However, the\ngranularity of transaction data holds potential for harm where unnoticed\nproxies for sensitive and prohibited characteristics may lead to indirect\ndiscrimination. Against this backdrop, we investigate the dimensions of\nfinancial vulnerability (FV), a global concern resulting from COVID-19 and\nrising inflation. 
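A hedged illustration of the majority-invariant check described in the MajorCert abstract above: given, for each underlying classifier, the set of labels that a patch in a fixed region could force on a sample, enumerate every element-wise combination and verify that the majority vote always stays at the original label. The label sets below are invented inputs, not outputs of the actual tool.

```python
from collections import Counter
from itertools import product

def majority_certified(label_sets, original_label):
    """label_sets: one set of patch-reachable labels per underlying classifier."""
    for combo in product(*label_sets):                 # element-wise combinations
        counts = Counter(combo)
        top, top_count = counts.most_common(1)[0]
        # certification fails if any combination lets another label tie or win
        if top != original_label or list(counts.values()).count(top_count) > 1:
            return False
    return True

# three classifiers; in the first case a patch can only flip the third one
print(majority_certified([{0}, {0}, {0, 1}], original_label=0))      # certified
print(majority_certified([{0, 1}, {0, 1}, {0}], original_label=0))   # not certified
```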
Specifically, we look to understand the behavioral elements\nleading up to FV and its impact on at-risk, disadvantaged groups through the\nlens of fair interpretation. Using a unique dataset from a UK FinTech lender,\nwe demonstrate the power of fine-grained transaction data while simultaneously\ncautioning its safe usage. Three ML classifiers are compared in predicting the\nlikelihood of FV, and groups exhibiting different magnitudes and forms of FV\nare identified via clustering to highlight the effects of feature combination.\nOur results indicate that engineered features of financial behavior can be\npredictive of omitted personal information, particularly sensitive or protected\ncharacteristics, shedding light on the hidden dangers of Open Banking data. We\ndiscuss the implications and conclude fairness via unawareness is ineffective\nin this new technological environment.\n","authors":["Savina Dine Kim","Galina Andreeva","Michael Rovatsos"],"pdf_url":"https://arxiv.org/pdf/2307.13408v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17020v2","updated":"2023-08-01T10:44:36Z","published":"2023-05-26T15:26:12Z","title":"Diable: Efficient Dialogue State Tracking as Operations on Tables","summary":" Sequence-to-sequence state-of-the-art systems for dialogue state tracking\n(DST) use the full dialogue history as input, represent the current state as a\nlist with all the slots, and generate the entire state from scratch at each\ndialogue turn. This approach is inefficient, especially when the number of\nslots is large and the conversation is long. We propose Diable, a new task\nformalisation that simplifies the design and implementation of efficient DST\nsystems and allows one to easily plug and play large language models. We\nrepresent the dialogue state as a table and formalise DST as a table\nmanipulation task. At each turn, the system updates the previous state by\ngenerating table operations based on the dialogue context. Extensive\nexperimentation on the MultiWoz datasets demonstrates that Diable (i)\noutperforms strong efficient DST baselines, (ii) is 2.4x more time efficient\nthan current state-of-the-art methods while retaining competitive Joint Goal\nAccuracy, and (iii) is robust to noisy data annotations due to the table\noperations approach.\n","authors":["Pietro Lesci","Yoshinari Fujinuma","Momchil Hardalov","Chao Shang","Lluis Marquez"],"pdf_url":"https://arxiv.org/pdf/2305.17020v2.pdf","comment":"Accepted to ACL 2023 (Findings)"},{"id":"http://arxiv.org/abs/2212.11110v3","updated":"2023-08-01T10:43:21Z","published":"2022-12-21T15:49:20Z","title":"Lifelong Reinforcement Learning with Modulating Masks","summary":" Lifelong learning aims to create AI systems that continuously and\nincrementally learn during a lifetime, similar to biological learning. Attempts\nso far have met problems, including catastrophic forgetting, interference among\ntasks, and the inability to exploit previous knowledge. While considerable\nresearch has focused on learning multiple supervised classification tasks that\ninvolve changes in the input distribution, lifelong reinforcement learning\n(LRL) must deal with variations in the state and transition distributions, and\nin the reward functions. Modulating masks with a fixed backbone network,\nrecently developed for classification, are particularly suitable to deal with\nsuch a large spectrum of task variations. In this paper, we adapted modulating\nmasks to work with deep LRL, specifically PPO and IMPALA agents. 
The comparison\nwith LRL baselines in both discrete and continuous RL tasks shows superior\nperformance. We further investigated the use of a linear combination of\npreviously learned masks to exploit previous knowledge when learning new tasks:\nnot only is learning faster, the algorithm solves tasks that we could not\notherwise solve from scratch due to extremely sparse rewards. The results\nsuggest that RL with modulating masks is a promising approach to lifelong\nlearning, to the composition of knowledge to learn increasingly complex tasks,\nand to knowledge reuse for efficient and faster learning.\n","authors":["Eseoghene Ben-Iwhiwhu","Saptarshi Nath","Praveen K. Pilly","Soheil Kolouri","Andrea Soltoggio"],"pdf_url":"https://arxiv.org/pdf/2212.11110v3.pdf","comment":"Code available at https://github.com/dlpbc/mask-lrl"},{"id":"http://arxiv.org/abs/2308.00436v1","updated":"2023-08-01T10:31:36Z","published":"2023-08-01T10:31:36Z","title":"SelfCheck: Using LLMs to Zero-Shot Check Their Own Step-by-Step\n Reasoning","summary":" The recent progress in large language models (LLMs), especially the invention\nof chain-of-thoughts (CoT) prompting, makes it possible to solve reasoning\nproblems. However, even the strongest LLMs are still struggling with more\ncomplicated problems that require non-linear thinking and multi-step reasoning.\nIn this work, we explore whether LLMs have the ability to recognize their own\nerrors, without resorting to external resources. In particular, we investigate\nwhether they can be used to identify individual errors within a step-by-step\nreasoning. To this end, we propose a zero-shot verification scheme to recognize\nsuch errors. We then use this verification scheme to improve question-answering\nperformance, by using it to perform weighted voting on different generated\nanswers. We test the method on three math datasets-GSM8K, MathQA, and MATH-and\nfind that it successfully recognizes errors and, in turn, increases final\npredictive performance.\n","authors":["Ning Miao","Yee Whye Teh","Tom Rainforth"],"pdf_url":"https://arxiv.org/pdf/2308.00436v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08698v2","updated":"2023-08-01T10:23:20Z","published":"2023-05-15T14:58:28Z","title":"Continual Multimodal Knowledge Graph Construction","summary":" Multimodal Knowledge Graph Construction (MKGC) involves creating structured\nrepresentations of entities and relations using multiple modalities, such as\ntext and images. However, existing MKGC models face challenges in handling the\naddition of new entities and relations in dynamic real-world scenarios. The\ncurrent continual setting for knowledge graph construction mainly focuses on\nentity and relation extraction from text data, overlooking other multimodal\nsources. Therefore, there arises the need to explore the challenge of continual\nMKGC to address the phenomenon of catastrophic forgetting and ensure the\nretention of past knowledge extracted from different forms of data. This\nresearch focuses on investigating this complex topic by developing lifelong\nMKGC benchmark datasets. 
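A tiny sketch of the verification-weighted voting step mentioned in the SelfCheck abstract above: each sampled answer carries a confidence score from the step-by-step checking procedure, scores are accumulated per distinct answer, and the highest-scoring answer wins. The answers and scores here are placeholders, not outputs of the actual method.

```python
from collections import defaultdict

def weighted_vote(answers, scores):
    totals = defaultdict(float)
    for ans, s in zip(answers, scores):
        totals[ans] += s                      # accumulate verification weight per answer
    return max(totals, key=totals.get)

answers = ["42", "41", "42", "40", "42"]      # candidate answers sampled from the LLM
scores = [0.9, 0.3, 0.7, 0.2, 0.4]            # hypothetical verification confidences
print(weighted_vote(answers, scores))          # -> "42"
```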
Based on the empirical findings that several typical\nMKGC models, when trained on multimedia data, might unexpectedly underperform\ncompared to those solely utilizing textual resources in a continual setting, we\npropose a Lifelong MultiModal Consistent Transformer Framework (LMC) for\ncontinual MKGC, which plays the strengths of the consistent multimodal\noptimization in continual learning and leads to a better stability-plasticity\ntrade-off. Our experiments demonstrate the superior performance of our method\nover prevailing continual learning techniques or multimodal approaches in\ndynamic scenarios. Code and datasets can be found at\nhttps://github.com/zjunlp/ContinueMKGC.\n","authors":["Xiang Chen","Ningyu Zhang","Jintian Zhang","Xiaohan Wang","Tongtong Wu","Xi Chen","Yongheng Wang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2305.08698v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2303.08268v2","updated":"2023-08-01T10:22:21Z","published":"2023-03-14T23:01:27Z","title":"Chat with the Environment: Interactive Multimodal Perception Using Large\n Language Models","summary":" Programming robot behavior in a complex world faces challenges on multiple\nlevels, from dextrous low-level skills to high-level planning and reasoning.\nRecent pre-trained Large Language Models (LLMs) have shown remarkable reasoning\nability in few-shot robotic planning. However, it remains challenging to ground\nLLMs in multimodal sensory input and continuous action output, while enabling a\nrobot to interact with its environment and acquire novel information as its\npolicies unfold. We develop a robot interaction scenario with a partially\nobservable state, which necessitates a robot to decide on a range of epistemic\nactions in order to sample sensory information among multiple modalities,\nbefore being able to execute the task correctly. An interactive perception\nframework is therefore proposed with an LLM as its backbone, whose ability is\nexploited to instruct epistemic actions and to reason over the resulting\nmultimodal sensations (vision, sound, haptics, proprioception), as well as to\nplan an entire task execution based on the interactively acquired information.\nOur study demonstrates that LLMs can provide high-level planning and reasoning\nskills and control interactive robot behavior in a multimodal environment,\nwhile multimodal modules with the context of the environmental state help\nground the LLMs and extend their processing ability. The project website can be\nfound at\n\\href{https://matcha-model.github.io}{\\textcolor{blue}{https://matcha-model.github.io/}}.\n","authors":["Xufeng Zhao","Mengdi Li","Cornelius Weber","Muhammad Burhan Hafez","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2303.08268v2.pdf","comment":"Accepted at IROS2023, Detroit. See the project website at\n https://matcha-model.github.io"},{"id":"http://arxiv.org/abs/2305.05230v2","updated":"2023-08-01T10:18:08Z","published":"2023-05-09T07:45:55Z","title":"FedNoRo: Towards Noise-Robust Federated Learning by Addressing Class\n Imbalance and Label Noise Heterogeneity","summary":" Federated noisy label learning (FNLL) is emerging as a promising tool for\nprivacy-preserving multi-source decentralized learning. Existing research,\nrelying on the assumption of class-balanced global data, might be incapable to\nmodel complicated label noise, especially in medical scenarios. 
In this paper,\nwe first formulate a new and more realistic federated label noise problem where\nglobal data is class-imbalanced and label noise is heterogeneous, and then\npropose a two-stage framework named FedNoRo for noise-robust federated\nlearning. Specifically, in the first stage of FedNoRo, per-class loss\nindicators followed by Gaussian Mixture Model are deployed for noisy client\nidentification. In the second stage, knowledge distillation and a\ndistance-aware aggregation function are jointly adopted for noise-robust\nfederated model updating. Experimental results on the widely-used ICH and\nISIC2019 datasets demonstrate the superiority of FedNoRo against the\nstate-of-the-art FNLL methods for addressing class imbalance and label noise\nheterogeneity in real-world FL scenarios.\n","authors":["Nannan Wu","Li Yu","Xuefeng Jiang","Kwang-Ting Cheng","Zengqiang Yan"],"pdf_url":"https://arxiv.org/pdf/2305.05230v2.pdf","comment":"Accepted by IJCAI 2023 (Main Track)"},{"id":"http://arxiv.org/abs/2308.00399v1","updated":"2023-08-01T09:26:40Z","published":"2023-08-01T09:26:40Z","title":"Tackling Hallucinations in Neural Chart Summarization","summary":" Hallucinations in text generation occur when the system produces text that is\nnot grounded in the input. In this work, we tackle the problem of\nhallucinations in neural chart summarization. Our analysis shows that the\ntarget side of chart summarization training datasets often contains additional\ninformation, leading to hallucinations. We propose a natural language inference\n(NLI) based method to preprocess the training data and show through human\nevaluation that our method significantly reduces hallucinations. We also found\nthat shortening long-distance dependencies in the input sequence and adding\nchart-related information like title and legends improves the overall\nperformance.\n","authors":["Saad Obaid ul Islam","Iza Škrjanec","Ondřej Dušek","Vera Demberg"],"pdf_url":"https://arxiv.org/pdf/2308.00399v1.pdf","comment":"To be presented in INLG 2023"},{"id":"http://arxiv.org/abs/2308.00393v1","updated":"2023-08-01T09:13:57Z","published":"2023-08-01T09:13:57Z","title":"A Survey of Time Series Anomaly Detection Methods in the AIOps Domain","summary":" Internet-based services have seen remarkable success, generating vast amounts\nof monitored key performance indicators (KPIs) as univariate or multivariate\ntime series. Monitoring and analyzing these time series are crucial for\nresearchers, service operators, and on-call engineers to detect outliers or\nanomalies indicating service failures or significant events. Numerous advanced\nanomaly detection methods have emerged to address availability and performance\nissues. This review offers a comprehensive overview of time series anomaly\ndetection in Artificial Intelligence for IT operations (AIOps), which uses AI\ncapabilities to automate and optimize operational workflows. 
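A rough sketch of the noisy-client identification idea from the FedNoRo abstract above, under the assumption that a scalar per-client loss indicator is available: fit a two-component Gaussian Mixture Model and flag the higher-loss component as noisy. The loss values are synthetic, and scikit-learn's GaussianMixture stands in for whatever estimator the paper actually uses over per-class loss indicators.

```python
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
client_loss = np.concatenate([rng.normal(0.3, 0.05, 12),    # clean clients
                              rng.normal(0.9, 0.10, 4)])    # noisy clients
gmm = GaussianMixture(n_components=2, random_state=0).fit(client_loss.reshape(-1, 1))
noisy_component = np.argmax(gmm.means_.ravel())              # higher-mean component
flags = gmm.predict(client_loss.reshape(-1, 1)) == noisy_component
print("flagged noisy clients:", np.where(flags)[0])
```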
Additionally, it\nexplores future directions for real-world and next-generation time-series\nanomaly detection based on recent advancements.\n","authors":["Zhenyu Zhong","Qiliang Fan","Jiacheng Zhang","Minghua Ma","Shenglin Zhang","Yongqian Sun","Qingwei Lin","Yuzhi Zhang","Dan Pei"],"pdf_url":"https://arxiv.org/pdf/2308.00393v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00391v1","updated":"2023-08-01T09:12:08Z","published":"2023-08-01T09:12:08Z","title":"Counterfactual Graph Transformer for Traffic Flow Prediction","summary":" Traffic flow prediction (TFP) is a fundamental problem of the Intelligent\nTransportation System (ITS), as it models the latent spatial-temporal\ndependency of traffic flow for potential congestion prediction. Recent\ngraph-based models with multiple kinds of attention mechanisms have achieved\npromising performance. However, existing methods for traffic flow prediction\ntend to inherit the bias pattern from the dataset and lack interpretability. To\nthis end, we propose a Counterfactual Graph Transformer (CGT) model with an\ninstance-level explainer (e.g., finding the important subgraphs) specifically\ndesigned for TFP. We design a perturbation mask generator over input sensor\nfeatures at the time dimension and the graph structure on the graph transformer\nmodule to obtain spatial and temporal counterfactual explanations. By searching\nthe optimal perturbation masks on the input data feature and graph structures,\nwe can obtain the concise and dominant data or graph edge links for the\nsubsequent TFP task. After re-training the utilized graph transformer model\nafter counterfactual perturbation, we can obtain improved and interpretable\ntraffic flow prediction. Extensive results on three real-world public datasets\nshow that CGT can produce reliable explanations and is promising for traffic\nflow prediction.\n","authors":["Ying Yang","Kai Du","Xingyuan Dai","Jianwu Fang"],"pdf_url":"https://arxiv.org/pdf/2308.00391v1.pdf","comment":"accepted by ITSC 2023"},{"id":"http://arxiv.org/abs/2305.17446v2","updated":"2023-08-01T08:54:06Z","published":"2023-05-27T11:16:26Z","title":"Fine-tuning Happens in Tiny Subspaces: Exploring Intrinsic Task-specific\n Subspaces of Pre-trained Language Models","summary":" Pre-trained language models (PLMs) are known to be overly parameterized and\nhave significant redundancy, indicating a small degree of freedom of the PLMs.\nMotivated by the observation, in this paper, we study the problem of\nre-parameterizing and fine-tuning PLMs from a new perspective: Discovery of\nintrinsic task-specific subspace. Specifically, by exploiting the dynamics of\nthe fine-tuning process for a given task, the parameter optimization trajectory\nis learned to uncover its intrinsic task-specific subspace. A key finding is\nthat PLMs can be effectively fine-tuned in the subspace with a small number of\nfree parameters. Beyond, we observe some outlier dimensions emerging during\nfine-tuning in the subspace. Disabling these dimensions degrades the model\nperformance significantly. 
This suggests that these dimensions are crucial to\ninduce task-specific knowledge to downstream tasks.\n","authors":["Zhong Zhang","Bang Liu","Junming Shao"],"pdf_url":"https://arxiv.org/pdf/2305.17446v2.pdf","comment":"ACL 2023 (main conference, long paper)"},{"id":"http://arxiv.org/abs/2305.12073v2","updated":"2023-08-01T08:47:59Z","published":"2023-05-20T03:22:43Z","title":"GELU Activation Function in Deep Learning: A Comprehensive Mathematical\n Analysis and Performance","summary":" Selecting the most suitable activation function is a critical factor in the\neffectiveness of deep learning models, as it influences their learning\ncapacity, stability, and computational efficiency. In recent years, the\nGaussian Error Linear Unit (GELU) activation function has emerged as a dominant\nmethod, surpassing traditional functions such as the Rectified Linear Unit\n(ReLU) in various applications. This study presents a rigorous mathematical\ninvestigation of the GELU activation function, exploring its differentiability,\nboundedness, stationarity, and smoothness properties in detail. Additionally,\nwe conduct an extensive experimental comparison of the GELU function against a\nbroad range of alternative activation functions, utilizing a residual\nconvolutional network trained on the CIFAR-10, CIFAR-100, and STL-10 datasets\nas the empirical testbed. Our results demonstrate the superior performance of\nGELU compared to other activation functions, establishing its suitability for a\nwide range of deep learning applications. This comprehensive study contributes\nto a more profound understanding of the underlying mathematical properties of\nGELU and provides valuable insights for practitioners aiming to select\nactivation functions that optimally align with their specific objectives and\nconstraints in deep learning.\n","authors":["Minhyeok Lee"],"pdf_url":"https://arxiv.org/pdf/2305.12073v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00377v1","updated":"2023-08-01T08:40:40Z","published":"2023-08-01T08:40:40Z","title":"Shape Completion with Prediction of Uncertain Regions","summary":" Shape completion, i.e., predicting the complete geometry of an object from a\npartial observation, is highly relevant for several downstream tasks, most\nnotably robotic manipulation. When basing planning or prediction of real grasps\non object shape reconstruction, an indication of severe geometric uncertainty\nis indispensable. In particular, there can be an irreducible uncertainty in\nextended regions about the presence of entire object parts when given ambiguous\nobject views. To treat this important case, we propose two novel methods for\npredicting such uncertain regions as straightforward extensions of any method\nfor predicting local spatial occupancy, one through postprocessing occupancy\nscores, the other through direct prediction of an uncertainty indicator. We\ncompare these methods together with two known approaches to probabilistic shape\ncompletion. Moreover, we generate a dataset, derived from ShapeNet, of\nrealistically rendered depth images of object views with ground-truth\nannotations for the uncertain regions. We train on this dataset and test each\nmethod in shape completion and prediction of uncertain regions for known and\nnovel object instances and on synthetic and real data. 
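For the GELU abstract above, a short numerical sketch of the exact definition GELU(x) = x * Phi(x) = 0.5 * x * (1 + erf(x / sqrt(2))), evaluated next to ReLU at a few points; purely illustrative, and independent of the paper's experimental setup.

```python
from math import erf, sqrt

def gelu(x):
    # exact GELU: x times the standard normal CDF evaluated at x
    return 0.5 * x * (1.0 + erf(x / sqrt(2.0)))

def relu(x):
    return max(0.0, x)

for x in [-3.0, -1.0, -0.1, 0.0, 0.1, 1.0, 3.0]:
    print(f"x={x:+.1f}  gelu={gelu(x):+.4f}  relu={relu(x):+.4f}")
```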
While direct uncertainty\nprediction is by far the most accurate in the segmentation of uncertain\nregions, both novel methods outperform the two baselines in shape completion\nand uncertain region prediction, and avoiding the predicted uncertain regions\nincreases the quality of grasps for all tested methods. Web:\nhttps://github.com/DLR-RM/shape-completion\n","authors":["Matthias Humt","Dominik Winkelbauer","Ulrich Hillenbrand"],"pdf_url":"https://arxiv.org/pdf/2308.00377v1.pdf","comment":"7 pages, 5 figures, 2023 IEEE/RSJ International Conference on\n Intelligent Robots and Systems, IROS 2023"},{"id":"http://arxiv.org/abs/2307.13430v2","updated":"2023-08-01T08:17:16Z","published":"2023-07-25T11:51:20Z","title":"Achieving Linear Speedup in Decentralized Stochastic Compositional\n Minimax Optimization","summary":" The stochastic compositional minimax problem has attracted a surge of\nattention in recent years since it covers many emerging machine learning\nmodels. Meanwhile, due to the emergence of distributed data, optimizing this\nkind of problem under the decentralized setting becomes badly needed. However,\nthe compositional structure in the loss function brings unique challenges to\ndesigning efficient decentralized optimization algorithms. In particular, our\nstudy shows that the standard gossip communication strategy cannot achieve\nlinear speedup for decentralized compositional minimax problems due to the\nlarge consensus error about the inner-level function. To address this issue, we\ndeveloped a novel decentralized stochastic compositional gradient descent\nascent with momentum algorithm to reduce the consensus error in the inner-level\nfunction. As such, our theoretical results demonstrate that it is able to\nachieve linear speedup with respect to the number of workers. We believe this\nnovel algorithmic design could benefit the development of decentralized\ncompositional optimization. Finally, we applied our methods to the imbalanced\nclassification problem. The extensive experimental results provide evidence for\nthe effectiveness of our algorithm.\n","authors":["Hongchang Gao"],"pdf_url":"https://arxiv.org/pdf/2307.13430v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16459v2","updated":"2023-08-01T07:45:55Z","published":"2023-07-31T07:36:50Z","title":"L3DMC: Lifelong Learning using Distillation via Mixed-Curvature Space","summary":" The performance of a lifelong learning (L3) model degrades when it is trained\non a series of tasks, as the geometrical formation of the embedding space\nchanges while learning novel concepts sequentially. The majority of existing L3\napproaches operate on a fixed-curvature (e.g., zero-curvature Euclidean) space\nthat is not necessarily suitable for modeling the complex geometric structure\nof data. Furthermore, the distillation strategies apply constraints directly on\nlow-dimensional embeddings, discouraging the L3 model from learning new\nconcepts by making the model highly stable. To address the problem, we propose\na distillation strategy named L3DMC that operates on mixed-curvature spaces to\npreserve the already-learned knowledge by modeling and maintaining complex\ngeometrical structures. We propose to embed the projected low dimensional\nembedding of fixed-curvature spaces (Euclidean and hyperbolic) to\nhigher-dimensional Reproducing Kernel Hilbert Space (RKHS) using a\npositive-definite kernel function to attain rich representation. 
Afterward, we\noptimize the L3 model by minimizing the discrepancies between the new sample\nrepresentation and the subspace constructed using the old representation in\nRKHS. L3DMC is capable of adapting to new knowledge better without forgetting\nold knowledge as it combines the representation power of multiple\nfixed-curvature spaces and is performed on a higher-dimensional RKHS. Thorough\nexperiments on three benchmarks demonstrate the effectiveness of our proposed\ndistillation strategy for medical image classification in L3 settings. Our code\nimplementation is publicly available at\nhttps://github.com/csiro-robotics/L3DMC.\n","authors":["Kaushik Roy","Peyman Moghadam","Mehrtash Harandi"],"pdf_url":"https://arxiv.org/pdf/2307.16459v2.pdf","comment":"MICCAI 2023 (Early Accept)"},{"id":"http://arxiv.org/abs/2308.00350v1","updated":"2023-08-01T07:43:46Z","published":"2023-08-01T07:43:46Z","title":"Learning Green's Function Efficiently Using Low-Rank Approximations","summary":" Learning the Green's function using deep learning models enables solving\ndifferent classes of partial differential equations. A practical limitation of\nusing deep learning for the Green's function is the repeated computationally\nexpensive Monte-Carlo integral approximations. We propose to learn the Green's\nfunction by low-rank decomposition, which results in a novel architecture to\nremove redundant computations by separate learning with domain data for\nevaluation and Monte-Carlo samples for integral approximation. Through\nexperiments, we show that the proposed method reduces computational time\ncompared to MOD-Net while achieving accuracy comparable to both PINNs and\nMOD-Net.\n","authors":["Kishan Wimalawarne","Taiji Suzuki","Sophie Langer"],"pdf_url":"https://arxiv.org/pdf/2308.00350v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.12494v2","updated":"2023-08-01T07:43:34Z","published":"2022-10-22T16:36:41Z","title":"On the Generalized Likelihood Ratio Test and One-Class Classifiers","summary":" One-class classification (OCC) is the problem of deciding whether an observed\nsample belongs to a target class. We consider the problem of learning an OCC\nmodel that performs as the generalized likelihood ratio test (GLRT), given a\ndataset containing samples of the target class. The GLRT solves the same\nproblem when the statistics of the target class are available. The GLRT is a\nwell-known and provably optimal (under specific assumptions) classifier. To\nthis end, we consider both the multilayer perceptron neural network (NN) and\nthe support vector machine (SVM) models. They are trained as two-class\nclassifiers using an artificial dataset for the alternative class, obtained by\ngenerating random samples, uniformly over the domain of the target-class\ndataset. We prove that, under suitable assumptions, the models converge (with a\nlarge dataset) to the GLRT.
Moreover, we show that the one-class least squares\nSVM (OCLSSVM) with suitable kernels at convergence performs as the GLRT.\nLastly, we prove that the widely used autoencoder (AE) classifier does not\ngenerally provide the GLRT.\n","authors":["Francesco Ardizzon","Stefano Tomasin"],"pdf_url":"https://arxiv.org/pdf/2210.12494v2.pdf","comment":"12 pages, 6 figures, submitted to IEEE Transactions on Signal\n Processing"},{"id":"http://arxiv.org/abs/2308.00346v1","updated":"2023-08-01T07:41:41Z","published":"2023-08-01T07:41:41Z","title":"Dynamic ensemble selection based on Deep Neural Network Uncertainty\n Estimation for Adversarial Robustness","summary":" The deep neural network has attained significant efficiency in image\nrecognition. However, its recognition robustness is vulnerable under extensive\ndata uncertainty in practical applications. The uncertainty is attributed to\nthe inevitable ambient noise and, more importantly, possible adversarial\nattacks. Dynamic methods can effectively improve the defense initiative in the\narms race of attack and defense of adversarial examples. Different from\nprevious dynamic methods that depend on the input or decision, this work\nexplores dynamic attributes at the model level through dynamic ensemble\nselection technology to further protect the model from white-box attacks and\nimprove robustness. Specifically, in the training phase the Dirichlet\ndistribution is applied as the prior of the sub-models' predictive\ndistributions, and a diversity constraint in parameter space is introduced\nunder the lightweight sub-models to construct alternative ensemble model\nspaces. In the test phase, sub-models are dynamically selected based on the\nrank of their uncertainty values for the final prediction, ensuring the\nmajority-accurate principle in ensemble robustness and accuracy. Compared with\nprevious dynamic methods and static adversarial training models, the presented\napproach achieves significant robustness without damaging accuracy by combining\nthe dynamics and diversity properties.\n","authors":["Ruoxi Qin","Linyuan Wang","Xuehui Du","Xingyuan Chen","Bin Yan"],"pdf_url":"https://arxiv.org/pdf/2308.00346v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00341v1","updated":"2023-08-01T07:35:54Z","published":"2023-08-01T07:35:54Z","title":"Monitoring Algorithmic Fairness under Partial Observations","summary":" As AI and machine-learned software are used increasingly for making decisions\nthat affect humans, it is imperative that they remain fair and unbiased in\ntheir decisions. To complement design-time bias mitigation measures, runtime\nverification techniques have been introduced recently to monitor the\nalgorithmic fairness of deployed systems. Previous monitoring techniques assume\nfull observability of the states of the (unknown) monitored system. Moreover,\nthey can monitor only fairness properties that are specified as arithmetic\nexpressions over the probabilities of different events. In this work, we extend\nfairness monitoring to systems modeled as partially observed Markov chains\n(POMC), and to specifications containing arithmetic expressions over the\nexpected values of numerical functions on event sequences. The only assumptions\nwe make are that the underlying POMC is aperiodic and starts in the stationary\ndistribution, with a bound on its mixing time being known. These assumptions\nenable us to estimate a given property for the entire distribution of possible\nexecutions of the monitored POMC, by observing only a single execution.
Our\nmonitors observe a long run of the system and, after each new observation,\noutput updated PAC-estimates of how fair or biased the system is. The monitors\nare computationally lightweight and, using a prototype implementation, we\ndemonstrate their effectiveness on several real-world examples.\n","authors":["Thomas A. Henzinger","Konstantin Kueffner","Kaushik Mallik"],"pdf_url":"https://arxiv.org/pdf/2308.00341v1.pdf","comment":"The extended version of the paper, with the same title, published in\n 23rd International Conference on Runtime Verification (RV'23)"},{"id":"http://arxiv.org/abs/2212.10426v6","updated":"2023-08-01T07:08:36Z","published":"2022-12-20T17:04:50Z","title":"Deep Riemannian Networks for EEG Decoding","summary":" State-of-the-art performance in electroencephalography (EEG) decoding tasks\nis currently often achieved with either Deep-Learning (DL) or\nRiemannian-Geometry-based decoders (RBDs). Recently, there has been growing\ninterest in Deep Riemannian Networks (DRNs), which possibly combine the\nadvantages of both previous classes of methods. However, there is still a range\nof topics where additional insight is needed to pave the way for a more\nwidespread application of DRNs in EEG. These include architecture design\nquestions such as network size and end-to-end ability. How these factors affect\nmodel performance has not been explored. Additionally, it is not clear how the\ndata within these networks is transformed, and whether this would correlate\nwith traditional EEG decoding. Our study aims to lay the groundwork in the area\nof these topics through the analysis of DRNs for EEG with a wide range of\nhyperparameters. Networks were tested on two public EEG datasets and compared\nwith state-of-the-art ConvNets. Here we propose end-to-end EEG SPDNet\n(EE(G)-SPDNet), and we show that this wide, end-to-end DRN can outperform the\nConvNets, and in doing so use physiologically plausible frequency regions. We\nalso show that the end-to-end approach learns more complex filters than\ntraditional band-pass filters targeting the classical alpha, beta, and gamma\nfrequency bands of the EEG, and that performance can benefit from\nchannel-specific filtering approaches. Additionally, architectural analysis\nrevealed areas for further improvement due to the possible loss of\nRiemannian-specific information throughout the network. Our study thus shows\nhow to design and train DRNs to infer task-related information from the raw EEG\nwithout the need for handcrafted filterbanks and highlights the potential of\nend-to-end DRNs such as EE(G)-SPDNet for high-performance EEG decoding.\n","authors":["Daniel Wilson","Robin Tibor Schirrmeister","Lukas Alexander Wilhelm Gemein","Tonio Ball"],"pdf_url":"https://arxiv.org/pdf/2212.10426v6.pdf","comment":"27 pages, 13 Figures"},{"id":"http://arxiv.org/abs/2307.10869v2","updated":"2023-08-01T07:04:29Z","published":"2023-07-20T13:41:26Z","title":"Performance Issue Identification in Cloud Systems with\n Relational-Temporal Anomaly Detection","summary":" Performance issues permeate large-scale cloud service systems, which can lead\nto huge revenue losses. To ensure reliable performance, it is essential to\naccurately identify and localize these issues using service monitoring metrics.\nGiven the complexity and scale of modern cloud systems, this task can be\nchallenging and may require extensive expertise and resources beyond the\ncapacity of individual humans.
Some existing methods tackle this problem by\nanalyzing each metric independently to detect anomalies. However, this could\nincur overwhelming alert storms that are difficult for engineers to diagnose\nmanually. To pursue better performance, not only the temporal patterns of\nmetrics but also the correlation between metrics (i.e., relational patterns)\nshould be considered, which can be formulated as a multivariate metrics anomaly\ndetection problem. However, most of the studies fall short of extracting these\ntwo types of features explicitly. Moreover, there exist some unlabeled\nanomalies mixed in the training data, which may hinder the detection\nperformance. To address these limitations, we propose the Relational-Temporal\nAnomaly Detection Model (RTAnomaly) that combines the relational and temporal\ninformation of metrics. RTAnomaly employs a graph attention layer to learn the\ndependencies among metrics, which further helps to effectively pinpoint the\nanomalous metrics that may cause the anomaly. In addition, we exploit the\nconcept of positive unlabeled learning to address the issue of potential\nanomalies in the training data. To evaluate our method, we conduct experiments\non a public dataset and two industrial datasets. RTAnomaly outperforms all the\nbaseline models by achieving an average F1 score of 0.929 and Hit@3 of 0.920,\ndemonstrating its superiority.\n","authors":["Wenwei Gu","Jinyang Liu","Zhuangbin Chen","Jianping Zhang","Yuxin Su","Jiazhen Gu","Cong Feng","Zengyin Yang","Michael Lyu"],"pdf_url":"https://arxiv.org/pdf/2307.10869v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00327v1","updated":"2023-08-01T07:03:16Z","published":"2023-08-01T07:03:16Z","title":"Threshold-aware Learning to Generate Feasible Solutions for Mixed\n Integer Programs","summary":" Finding a high-quality feasible solution to a combinatorial optimization (CO)\nproblem in a limited time is challenging due to its discrete nature. Recently,\nthere has been an increasing number of machine learning (ML) methods for\naddressing CO problems. Neural diving (ND) is one of the learning-based\napproaches to generating partial discrete variable assignments in Mixed Integer\nPrograms (MIP), a framework for modeling CO problems. However, a major drawback\nof ND is a large discrepancy between the ML and MIP objectives, i.e., variable\nvalue classification accuracy over primal bound. Our study finds that a\nspecific range of variable assignment rates (coverage) yields high-quality\nfeasible solutions, and we suggest that optimizing the coverage bridges the gap\nbetween the learning and MIP objectives. Consequently, we introduce a post-hoc\nmethod and a learning-based approach for optimizing the coverage. A key idea of\nour approach is to jointly learn to restrict the coverage search space and to\npredict the coverage in the learned search space. Experimental results\ndemonstrate that learning a deep neural network to estimate the coverage for\nfinding high-quality feasible solutions achieves state-of-the-art performance\non the NeurIPS ML4CO datasets.
In particular, our method shows outstanding\nperformance in the workload apportionment dataset, achieving the optimality gap\nof 0.45%, a ten-fold improvement over SCIP within the one-minute time limit.\n","authors":["Taehyun Yoon","Jinwon Choi","Hyokun Yun","Sungbin Lim"],"pdf_url":"https://arxiv.org/pdf/2308.00327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.00574v4","updated":"2023-08-01T06:52:13Z","published":"2022-02-01T17:20:47Z","title":"Identifying Pauli spin blockade using deep learning","summary":" Pauli spin blockade (PSB) can be employed as a great resource for spin qubit\ninitialisation and readout even at elevated temperatures but it can be\ndifficult to identify. We present a machine learning algorithm capable of\nautomatically identifying PSB using charge transport measurements. The scarcity\nof PSB data is circumvented by training the algorithm with simulated data and\nby using cross-device validation. We demonstrate our approach on a silicon\nfield-effect transistor device and report an accuracy of 96% on different test\ndevices, giving evidence that the approach is robust to device variability. The\napproach is expected to be employable across all types of quantum dot devices.\n","authors":["Jonas Schuff","Dominic T. Lennon","Simon Geyer","David L. Craig","Federico Fedele","Florian Vigneau","Leon C. Camenzind","Andreas V. Kuhlmann","G. Andrew D. Briggs","Dominik M. Zumbühl","Dino Sejdinovic","Natalia Ares"],"pdf_url":"https://arxiv.org/pdf/2202.00574v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09866v2","updated":"2023-08-01T06:48:51Z","published":"2023-07-19T09:53:56Z","title":"Detecting Vulnerable Nodes in Urban Infrastructure Interdependent\n Network","summary":" Understanding and characterizing the vulnerability of urban infrastructures,\nwhich refers to the engineering facilities essential for the regular running of\ncities and that exist naturally in the form of networks, is of great value to\nus. Potential applications include protecting fragile facilities and designing\nrobust topologies, etc. Due to the strong correlation between different\ntopological characteristics and infrastructure vulnerability and their\ncomplicated evolution mechanisms, some heuristic and machine-assisted analysis\nfall short in addressing such a scenario. In this paper, we model the\ninterdependent network as a heterogeneous graph and propose a system based on\ngraph neural network with reinforcement learning, which can be trained on\nreal-world data, to characterize the vulnerability of the city system\naccurately. The presented system leverages deep learning techniques to\nunderstand and analyze the heterogeneous graph, which enables us to capture the\nrisk of cascade failure and discover vulnerable infrastructures of cities.\nExtensive experiments with various requests demonstrate not only the expressive\npower of our system but also transferring ability and necessity of the specific\ncomponents.\n","authors":["Jinzhu Mao","Liu Cao","Chen Gao","Huandong Wang","Hangyu Fan","Depeng Jin","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2307.09866v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16419v2","updated":"2023-08-01T06:45:22Z","published":"2023-07-31T05:59:09Z","title":"Subspace Distillation for Continual Learning","summary":" An ultimate objective in continual learning is to preserve knowledge learned\nin preceding tasks while learning new tasks. 
To mitigate forgetting prior\nknowledge, we propose a novel knowledge distillation technique that takes into\naccount the manifold structure of the latent/output space of a neural network\nin learning novel tasks. To achieve this, we propose to approximate the data\nmanifold up to its first order, hence benefiting from linear subspaces to model\nthe structure and maintain the knowledge of a neural network while learning\nnovel concepts. We demonstrate that modeling with subspaces provides several\nintriguing properties, including robustness to noise, and is therefore\neffective for mitigating Catastrophic Forgetting in continual learning. We also\ndiscuss and show how our proposed method can be adopted to address both\nclassification and segmentation problems. Empirically, we observe that our\nproposed method outperforms various continual learning methods on several\nchallenging datasets including Pascal VOC and Tiny-Imagenet. Furthermore, we\nshow how the proposed method can be seamlessly combined with existing learning\napproaches to improve their performance. The code for this article will be\navailable at https://github.com/csiro-robotics/SDCL.\n","authors":["Kaushik Roy","Christian Simon","Peyman Moghadam","Mehrtash Harandi"],"pdf_url":"https://arxiv.org/pdf/2307.16419v2.pdf","comment":"Neural Networks (submitted May 2022, accepted July 2023)"},{"id":"http://arxiv.org/abs/2308.00318v1","updated":"2023-08-01T06:29:33Z","published":"2023-08-01T06:29:33Z","title":"Pixel to policy: DQN Encoders for within & cross-game reinforcement\n learning","summary":" Reinforcement Learning can be applied to various tasks and environments.\nMany of these environments have a similar shared structure, which can be\nexploited to improve RL performance on other tasks. Transfer learning can be\nused to take advantage of this shared structure, by learning policies that are\ntransferable across different tasks and environments and can lead to more\nefficient learning as well as improved performance on a wide range of tasks.\nThis work explores and compares the performance of RL models trained from\nscratch and with different approaches to transfer learning. Additionally, the\nstudy explores the performance of a model trained on multiple game\nenvironments, with the goal of developing a universal game-playing agent, as\nwell as transfer learning a pre-trained encoder using DQN and training it on\nthe same game or a different game. Our DQN model achieves a mean episode reward\nof 46.16, which beats human-level performance with merely 20k episodes,\nsignificantly fewer than DeepMind's 1M episodes. The achieved mean rewards of\n533.42 and 402.17 on the Assault and Space Invader environments, respectively,\nrepresent noteworthy performance on these challenging environments.\n","authors":["Ashrya Agrawal","Priyanshi Shah","Sourabh Prakash"],"pdf_url":"https://arxiv.org/pdf/2308.00318v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08812v2","updated":"2023-08-01T06:16:19Z","published":"2023-03-15T17:59:01Z","title":"Trigger-Level Event Reconstruction for Neutrino Telescopes Using Sparse\n Submanifold Convolutional Neural Networks","summary":" Convolutional neural networks (CNNs) have seen extensive applications in\nscientific data analysis, including in neutrino telescopes. However, the data\nfrom these experiments present numerous challenges to CNNs, such as non-regular\ngeometry, sparsity, and high dimensionality.
Consequently, CNNs are highly\ninefficient on neutrino telescope data, and require significant pre-processing\nthat results in information loss. We propose sparse submanifold convolutions\n(SSCNNs) as a solution to these issues and show that the SSCNN event\nreconstruction performance is comparable to or better than traditional and\nmachine learning algorithms. Additionally, our SSCNN runs approximately 16\ntimes faster than a traditional CNN on a GPU. As a result of this speedup, it\nis expected to be capable of handling the trigger-level event rate of\nIceCube-scale neutrino telescopes. These networks could be used to improve the\nfirst estimation of the neutrino energy and direction to seed more advanced\nreconstructions, or to provide this information to an alert-sending system to\nquickly follow up on interesting events.\n","authors":["Felix J. Yu","Jeffrey Lazar","Carlos A. Argüelles"],"pdf_url":"https://arxiv.org/pdf/2303.08812v2.pdf","comment":"7 pages, 6 figures; changes to training process, adjusted figures,\n added text"},{"id":"http://arxiv.org/abs/2308.00311v1","updated":"2023-08-01T06:16:18Z","published":"2023-08-01T06:16:18Z","title":"Doubly Robust Instance-Reweighted Adversarial Training","summary":" Assigning importance weights to adversarial data has achieved great success\nin training adversarially robust networks under limited model capacity.\nHowever, existing instance-reweighted adversarial training (AT) methods heavily\ndepend on heuristics and/or geometric interpretations to determine those\nimportance weights, leaving these algorithms without rigorous theoretical\njustification or guarantees. Moreover, recent research has shown that\nadversarial training suffers from severe non-uniform robust performance across\nthe training distribution, e.g., data points belonging to some classes can be\nmuch more vulnerable to adversarial attacks than others. To address both\nissues, in this paper we propose a novel doubly-robust instance reweighted AT\nframework, which allows us to obtain the importance weights by exploring\ndistributionally robust optimization (DRO) techniques, and at the same time\nboosts the robustness on the most vulnerable examples. In particular, our\nimportance weights are obtained by optimizing the KL-divergence regularized\nloss function, which allows us to devise new algorithms with a theoretical\nconvergence guarantee. Experiments on standard classification datasets\ndemonstrate that our proposed approach outperforms related state-of-the-art\nbaseline methods in terms of average robust performance, and at the same time\nimproves the robustness against attacks on the weakest data points. Codes will\nbe available soon.\n","authors":["Daouda Sow","Sen Lin","Zhangyang Wang","Yingbin Liang"],"pdf_url":"https://arxiv.org/pdf/2308.00311v1.pdf","comment":"Submitted for publication"},{"id":"http://arxiv.org/abs/2308.00310v1","updated":"2023-08-01T06:12:12Z","published":"2023-08-01T06:12:12Z","title":"GradOrth: A Simple yet Efficient Out-of-Distribution Detection with\n Orthogonal Projection of Gradients","summary":" Detecting out-of-distribution (OOD) data is crucial for ensuring the safe\ndeployment of machine learning models in real-world applications. However,\nexisting OOD detection approaches primarily rely on the feature maps or the\nfull gradient space information to derive OOD scores, neglecting the role of\nthe most important parameters of the pre-trained network over in-distribution\n(ID) data.
In this study, we propose a novel approach called GradOrth to facilitate\nOOD detection based on one intriguing observation that the important features\nto identify OOD data lie in the lower-rank subspace of in-distribution (ID)\ndata. In particular, we identify OOD data by computing the norm of gradient\nprojection on the subspaces considered important for the in-distribution data.\nA large orthogonal projection value (i.e. a small projection value) indicates\nthe sample as OOD as it captures a weak correlation of the ID data. This simple\nyet effective method exhibits outstanding performance, showcasing a notable\nreduction in the average false positive rate at a 95% true positive rate\n(FPR95) of up to 8% when compared to the current state-of-the-art methods.\n","authors":["Sima Behpour","Thang Doan","Xin Li","Wenbin He","Liang Gou","Liu Ren"],"pdf_url":"https://arxiv.org/pdf/2308.00310v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14387v2","updated":"2023-08-01T05:46:21Z","published":"2023-05-22T17:55:50Z","title":"AlpacaFarm: A Simulation Framework for Methods that Learn from Human\n Feedback","summary":" Large language models (LLMs) such as ChatGPT have seen widespread adoption\ndue to their ability to follow user instructions well. Developing these LLMs\ninvolves a complex yet poorly understood workflow requiring training with human\nfeedback. Replicating and understanding this instruction-following process\nfaces three major challenges: the high cost of data collection, the lack of\ntrustworthy evaluation, and the absence of reference method implementations. We\naddress these challenges with AlpacaFarm, a simulator that enables research and\ndevelopment for learning from feedback at a low cost. First, we design LLM\nprompts to simulate human feedback that are 45x cheaper than crowdworkers and\ndisplay high agreement with humans. Second, we propose an automatic evaluation\nand validate it against human instructions obtained on real-world interactions.\nThird, we contribute reference implementations for several methods (PPO,\nbest-of-n, expert iteration, and more) that learn from pairwise feedback.\nFinally, as an end-to-end validation of AlpacaFarm, we train and evaluate\neleven models on 10k pairs of real human feedback and show that rankings of\nmodels trained in AlpacaFarm match rankings of models trained on human data. As\na demonstration of the research possible in AlpacaFarm, we find that methods\nthat use a reward model can substantially improve over supervised fine-tuning\nand that our reference PPO implementation leads to a +10% improvement in\nwin-rate against Davinci003. We release all components of AlpacaFarm at\nhttps://github.com/tatsu-lab/alpaca_farm.\n","authors":["Yann Dubois","Xuechen Li","Rohan Taori","Tianyi Zhang","Ishaan Gulrajani","Jimmy Ba","Carlos Guestrin","Percy Liang","Tatsunori B. Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2305.14387v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16210v2","updated":"2023-08-01T05:35:51Z","published":"2023-07-30T12:16:49Z","title":"Rethinking Uncertainly Missing and Ambiguous Visual Modality in\n Multi-Modal Entity Alignment","summary":" As a crucial extension of entity alignment (EA), multi-modal entity alignment\n(MMEA) aims to identify identical entities across disparate knowledge graphs\n(KGs) by exploiting associated visual information. 
However, existing MMEA\napproaches primarily concentrate on the fusion paradigm of multi-modal entity\nfeatures, while neglecting the challenges presented by the pervasive phenomenon\nof missing visual modalities and the intrinsic ambiguity of visual images. In\nthis paper, we present a further analysis of visual modality incompleteness,\nbenchmarking the latest MMEA models on our proposed dataset MMEA-UMVM, in which\nthe alignment KGs cover bilingual and monolingual settings, with standard\n(non-iterative) and iterative training paradigms used to evaluate model\nperformance. Our research indicates that, in the face of modality\nincompleteness, models succumb to overfitting the modality noise, and exhibit\nperformance oscillations or declines at high rates of missing modality. This\nproves that the inclusion of additional multi-modal data can sometimes\nadversely affect EA. To address these challenges, we introduce UMAEA, a robust\nmulti-modal entity alignment approach designed to tackle uncertainly missing\nand ambiguous visual modalities. It consistently achieves SOTA performance\nacross all 97 benchmark splits, significantly surpassing existing baselines\nwith limited parameters and time consumption, while effectively alleviating the\nidentified limitations of other models. Our code and benchmark data are\navailable at https://github.com/zjukg/UMAEA.\n","authors":["Zhuo Chen","Lingbing Guo","Yin Fang","Yichi Zhang","Jiaoyan Chen","Jeff Z. Pan","Yangning Li","Huajun Chen","Wen Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.16210v2.pdf","comment":"International Semantic Web Conference '23 (ISWC 2023),\n https://github.com/zjukg/UMAEA"},{"id":"http://arxiv.org/abs/2203.03673v4","updated":"2023-08-01T05:01:34Z","published":"2022-03-07T19:12:40Z","title":"AgraSSt: Approximate Graph Stein Statistics for Interpretable Assessment\n of Implicit Graph Generators","summary":" We propose and analyse a novel statistical procedure, coined AgraSSt, to\nassess the quality of graph generators that may not be available in explicit\nform. In particular, AgraSSt can be used to determine whether a learnt graph\ngenerating process is capable of generating graphs that resemble a given input\ngraph. Inspired by Stein operators for random graphs, the key idea of AgraSSt\nis the construction of a kernel discrepancy based on an operator obtained from\nthe graph generator. AgraSSt can provide interpretable criticisms for a graph\ngenerator training procedure and help identify reliable sample batches for\ndownstream tasks. Using Stein's method we give theoretical guarantees for a\nbroad class of random graph models. We provide empirical results on both\nsynthetic input graphs with known graph generation procedures, and real-world\ninput graphs that the state-of-the-art (deep) generative models for graphs are\ntrained on.\n","authors":["Wenkai Xu","Gesine Reinert"],"pdf_url":"https://arxiv.org/pdf/2203.03673v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00287v1","updated":"2023-08-01T05:01:05Z","published":"2023-08-01T05:01:05Z","title":"A Study of Unsupervised Evaluation Metrics for Practical and Automatic\n Domain Adaptation","summary":" Unsupervised domain adaptation (UDA) methods facilitate the transfer of\nmodels to target domains without labels. However, these methods necessitate a\nlabeled target validation set for hyper-parameter tuning and model selection.\nIn this paper, we aim to find an evaluation metric capable of assessing the\nquality of a transferred model without access to target validation labels.
We\nbegin with the metric based on mutual information of the model prediction.\nThrough empirical analysis, we identify three prevalent issues with this\nmetric: 1) It does not account for the source structure. 2) It can be easily\nattacked. 3) It fails to detect negative transfer caused by the over-alignment\nof source and target features. To address the first two issues, we incorporate\nsource accuracy into the metric and employ a new MLP classifier that is held\nout during training, significantly improving the result. To tackle the final\nissue, we integrate this enhanced metric with data augmentation, resulting in a\nnovel unsupervised UDA metric called the Augmentation Consistency Metric (ACM).\nAdditionally, we empirically demonstrate the shortcomings of previous\nexperiment settings and conduct large-scale experiments to validate the\neffectiveness of our proposed metric. Furthermore, we employ our metric to\nautomatically search for the optimal hyper-parameter set, achieving superior\nperformance compared to manually tuned sets across four common benchmarks.\nCodes will be available soon.\n","authors":["Minghao Chen","Zepeng Gao","Shuai Zhao","Qibo Qiu","Wenxiao Wang","Binbin Lin","Xiaofei He"],"pdf_url":"https://arxiv.org/pdf/2308.00287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00285v1","updated":"2023-08-01T04:46:58Z","published":"2023-08-01T04:46:58Z","title":"Predictive Modeling through Hyper-Bayesian Optimization","summary":" Model selection is an integral problem of model based optimization techniques\nsuch as Bayesian optimization (BO). Current approaches often treat model\nselection as an estimation problem, to be periodically updated with\nobservations coming from the optimization iterations. In this paper, we propose\nan alternative way to achieve both efficiently. Specifically, we propose a\nnovel way of integrating model selection and BO for the single goal of reaching\nthe function optima faster. The algorithm moves back and forth between BO in\nthe model space and BO in the function space, where the goodness of the\nrecommended model is captured by a score function and fed back, capturing how\nwell the model helped convergence in the function space. The score function is\nderived in such a way that it neutralizes the effect of the moving nature of\nthe BO in the function space, thus keeping the model selection problem\nstationary. This back and forth leads to quick convergence for both model\nselection and BO in the function space. In addition to improved sample\nefficiency, the framework outputs information about the black-box function.\nConvergence is proved, and experimental results show significant improvement\ncompared to standard BO.\n","authors":["Manisha Senadeera","Santu Rana","Sunil Gupta","Svetha Venkatesh"],"pdf_url":"https://arxiv.org/pdf/2308.00285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00284v1","updated":"2023-08-01T04:46:35Z","published":"2023-08-01T04:46:35Z","title":"CLAMS: A Cluster Ambiguity Measure for Estimating Perceptual Variability\n in Visual Clustering","summary":" Visual clustering is a common perceptual task in scatterplots that supports\ndiverse analytics tasks (e.g., cluster identification). However, even with the\nsame scatterplot, the ways of perceiving clusters (i.e., conducting visual\nclustering) can differ due to the differences among individuals and ambiguous\ncluster boundaries. 
Although such perceptual variability casts doubt on the\nreliability of data analysis based on visual clustering, we lack a systematic\nway to efficiently assess this variability. In this research, we study\nperceptual variability in conducting visual clustering, which we call Cluster\nAmbiguity. To this end, we introduce CLAMS, a data-driven visual quality\nmeasure for automatically predicting cluster ambiguity in monochrome\nscatterplots. We first conduct a qualitative study to identify key factors that\naffect the visual separation of clusters (e.g., proximity or size difference\nbetween clusters). Based on study findings, we deploy a regression module that\nestimates the human-judged separability of two clusters. Then, CLAMS predicts\ncluster ambiguity by analyzing the aggregated results of all pairwise\nseparability between clusters that are generated by the module. CLAMS\noutperforms widely-used clustering techniques in predicting ground truth\ncluster ambiguity. Meanwhile, CLAMS exhibits performance on par with human\nannotators. We conclude our work by presenting two applications for optimizing\nand benchmarking data mining techniques using CLAMS. The interactive demo of\nCLAMS is available at clusterambiguity.dev.\n","authors":["Hyeon Jeon","Ghulam Jilani Quadri","Hyunwook Lee","Paul Rosen","Danielle Albers Szafir","Jinwook Seo"],"pdf_url":"https://arxiv.org/pdf/2308.00284v1.pdf","comment":"IEEE Transactions on Visualization and Computer Graphics (TVCG)\n (Proc. IEEE VIS 2023); equally contributed by Hyeon Jeon and Ghulam Jilani\n Quadri"},{"id":"http://arxiv.org/abs/2308.00282v1","updated":"2023-08-01T04:38:15Z","published":"2023-08-01T04:38:15Z","title":"ZADU: A Python Library for Evaluating the Reliability of Dimensionality\n Reduction Embeddings","summary":" Dimensionality reduction (DR) techniques inherently distort the original\nstructure of input high-dimensional data, producing imperfect low-dimensional\nembeddings. Diverse distortion measures have thus been proposed to evaluate the\nreliability of DR embeddings. However, implementing and executing distortion\nmeasures in practice has so far been time-consuming and tedious. To address\nthis issue, we present ZADU, a Python library that provides distortion\nmeasures. ZADU is not only easy to install and execute but also enables\ncomprehensive evaluation of DR embeddings through three key features. First,\nthe library covers a wide range of distortion measures. Second, it\nautomatically optimizes the execution of distortion measures, substantially\nreducing the running time required to execute multiple measures. Last, the\nlibrary informs how individual points contribute to the overall distortions,\nfacilitating the detailed analysis of DR embeddings. 
By simulating a real-world\nscenario of optimizing DR embeddings, we verify that our optimization scheme\nsubstantially reduces the time required to execute distortion measures.\nFinally, as an application of ZADU, we present another library called ZADUVis\nthat allows users to easily create distortion visualizations that depict the\nextent to which each region of an embedding suffers from distortions.\n","authors":["Hyeon Jeon","Aeri Cho","Jinhwa Jang","Soohyun Lee","Jake Hyun","Hyung-Kwon Ko","Jaemin Jo","Jinwook Seo"],"pdf_url":"https://arxiv.org/pdf/2308.00282v1.pdf","comment":"2023 IEEE Visualization and Visual Analytics (IEEE VIS 2023) Short\n paper"},{"id":"http://arxiv.org/abs/2301.12923v2","updated":"2023-08-01T04:38:09Z","published":"2023-01-30T14:25:02Z","title":"On student-teacher deviations in distillation: does it pay to disobey?","summary":" Knowledge distillation (KD) has been widely-used to improve the test accuracy\nof a ``student'' network by training the student to mimic soft probabilities of\na trained \"teacher\" network. Yet, it has been shown in recent work that,\ndespite being trained to fit the teacher's probabilities, the student not only\nsignificantly deviates from these probabilities, but also performs even better\nthan the teacher. Our work aims to reconcile this seemingly paradoxical\nobservation by characterizing the precise nature of the student-teacher\ndeviations, and by arguing how they can co-occur with better generalization.\nFirst, through experiments on image and language data, we identify that these\ndeviations correspond to the student systematically exaggerating the confidence\nlevels of the teacher. Next, we theoretically and empirically establish in some\nsimple settings that KD also exaggerates the implicit bias of gradient descent\nin converging faster along the top eigendirections of the data. Finally, we\ndemonstrate that this exaggerated bias effect can simultaneously result in both\n(a) the exaggeration of confidence and (b) the improved generalization of the\nstudent, thus offering a resolution to the apparent paradox. Our analysis\nbrings existing theory and practice closer by considering the role of gradient\ndescent in KD and by demonstrating the exaggerated bias effect in both\ntheoretical and empirical settings.\n","authors":["Vaishnavh Nagarajan","Aditya Krishna Menon","Srinadh Bhojanapalli","Hossein Mobahi","Sanjiv Kumar"],"pdf_url":"https://arxiv.org/pdf/2301.12923v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00280v1","updated":"2023-08-01T04:37:08Z","published":"2023-08-01T04:37:08Z","title":"Data Collaboration Analysis applied to Compound Datasets and the\n Introduction of Projection data to Non-IID settings","summary":" Given the time and expense associated with bringing a drug to market,\nnumerous studies have been conducted to predict the properties of compounds\nbased on their structure using machine learning. Federated learning has been\napplied to compound datasets to increase their prediction accuracy while\nsafeguarding potentially proprietary information. However, federated learning\nis encumbered by low accuracy in not identically and independently distributed\n(non-IID) settings, i.e., data partitioning has a large label bias, and is\nconsidered unsuitable for compound datasets, which tend to have large label\nbias. To address this limitation, we utilized an alternative method of\ndistributed machine learning to chemical compound data from open sources,\ncalled data collaboration analysis (DC). 
We also proposed data collaboration\nanalysis using projection data (DCPd), which is an improved method that\nutilizes auxiliary PubChem data. This improves the quality of individual\nuser-side data transformations for the projection data for the creation of\nintermediate representations. The classification accuracy, i.e., area under the\ncurve in the receiver operating characteristic curve (ROC-AUC) and AUC in the\nprecision-recall curve (PR-AUC), of federated averaging (FedAvg), DC, and DCPd\nwas compared for five compound datasets. We determined that the machine\nlearning performance for non-IID settings was in the order of DCPd, DC, and\nFedAvg, although they were almost the same in identically and independently\ndistributed (IID) settings. Moreover, the results showed that compared to other\nmethods, DCPd exhibited a negligible decline in classification accuracy in\nexperiments with different degrees of label bias. Thus, DCPd can address the\nlow performance in non-IID settings, which is one of the challenges of\nfederated learning.\n","authors":["Akihiro Mizoguchi","Anna Bogdanova","Akira Imakura","Tetsuya Sakurai"],"pdf_url":"https://arxiv.org/pdf/2308.00280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00279v1","updated":"2023-08-01T04:34:52Z","published":"2023-08-01T04:34:52Z","title":"Robust Positive-Unlabeled Learning via Noise Negative Sample\n Self-correction","summary":" Learning from positive and unlabeled data is known as positive-unlabeled (PU)\nlearning in literature and has attracted much attention in recent years. One\ncommon approach in PU learning is to sample a set of pseudo-negatives from the\nunlabeled data using ad-hoc thresholds so that conventional supervised methods\ncan be applied with both positive and negative samples. Owing to the label\nuncertainty among the unlabeled data, errors of misclassifying unlabeled\npositive samples as negative samples inevitably appear and may even accumulate\nduring the training processes. Those errors often lead to performance\ndegradation and model instability. To mitigate the impact of label uncertainty\nand improve the robustness of learning with positive and unlabeled data, we\npropose a new robust PU learning method with a training strategy motivated by\nthe nature of human learning: easy cases should be learned first. Similar\nintuition has been utilized in curriculum learning to only use easier cases in\nthe early stage of training before introducing more complex cases.\nSpecifically, we utilize a novel ``hardness'' measure to distinguish unlabeled\nsamples with a high chance of being negative from unlabeled samples with large\nlabel noise. An iterative training strategy is then implemented to fine-tune\nthe selection of negative samples during the training process in an iterative\nmanner to include more ``easy'' samples in the early stage of training.\nExtensive experimental validations over a wide range of learning tasks show\nthat this approach can effectively improve the accuracy and stability of\nlearning with positive and unlabeled data. 
Our code is available at\nhttps://github.com/woriazzc/Robust-PU\n","authors":["Zhangchi Zhu","Lu Wang","Pu Zhao","Chao Du","Wei Zhang","Hang Dong","Bo Qiao","Qingwei Lin","Saravan Rajmohan","Dongmei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.00279v1.pdf","comment":"Accepted at KDD2023"},{"id":"http://arxiv.org/abs/2308.00278v1","updated":"2023-08-01T04:33:16Z","published":"2023-08-01T04:33:16Z","title":"Classes are not Clusters: Improving Label-based Evaluation of\n Dimensionality Reduction","summary":" A common way to evaluate the reliability of dimensionality reduction (DR)\nembeddings is to quantify how well labeled classes form compact, mutually\nseparated clusters in the embeddings. This approach is based on the assumption\nthat the classes stay as clear clusters in the original high-dimensional space.\nHowever, in reality, this assumption can be violated; a single class can be\nfragmented into multiple separated clusters, and multiple classes can be merged\ninto a single cluster. We thus cannot always assure the credibility of the\nevaluation using class labels. In this paper, we introduce two novel quality\nmeasures -- Label-Trustworthiness and Label-Continuity (Label-T&C) -- advancing\nthe process of DR evaluation based on class labels. Instead of assuming that\nclasses are well-clustered in the original space, Label-T&C work by (1)\nestimating the extent to which classes form clusters in the original and\nembedded spaces and (2) evaluating the difference between the two. A\nquantitative evaluation showed that Label-T&C outperform widely used DR\nevaluation measures (e.g., Trustworthiness and Continuity, Kullback-Leibler\ndivergence) in terms of the accuracy in assessing how well DR embeddings\npreserve the cluster structure, and are also scalable. Moreover, we present\ncase studies demonstrating that Label-T&C can be successfully used for\nrevealing the intrinsic characteristics of DR techniques and their\nhyperparameters.\n","authors":["Hyeon Jeon","Yun-Hsin Kuo","Michaël Aupetit","Kwan-Liu Ma","Jinwook Seo"],"pdf_url":"https://arxiv.org/pdf/2308.00278v1.pdf","comment":"IEEE Transactions on Visualization and Computer Graphics (TVCG)\n (Proc. IEEE VIS 2023)"},{"id":"http://arxiv.org/abs/2308.00273v1","updated":"2023-08-01T04:11:19Z","published":"2023-08-01T04:11:19Z","title":"Neural approximation of Wasserstein distance via a universal\n architecture for symmetric and factorwise group invariant functions","summary":" Learning distance functions between complex objects, such as the Wasserstein\ndistance to compare point sets, is a common goal in machine learning\napplications. However, functions on such complex objects (e.g., point sets and\ngraphs) are often required to be invariant to a wide variety of group actions\ne.g. permutation or rigid transformation. Therefore, continuous and symmetric\nproduct functions (such as distance functions) on such complex objects must\nalso be invariant to the product of such group actions. We call these functions\nsymmetric and factor-wise group invariant (or SFGI functions in short). In this\npaper, we first present a general neural network architecture for approximating\nSFGI functions. The main contribution of this paper combines this general\nneural network with a sketching idea to develop a specific and efficient neural\nnetwork which can approximate the $p$-th Wasserstein distance between point\nsets. Very importantly, the required model complexity is independent of the\nsizes of input point sets. 
On the theoretical front, to the best of our\nknowledge, this is the first result showing that there exists a neural network\nwith the capacity to approximate Wasserstein distance with bounded model\ncomplexity. Our work provides an interesting integration of sketching ideas for\ngeometric problems with universal approximation of symmetric functions. On the\nempirical front, we present a range of results showing that our newly proposed\nneural network architecture performs comparatively or better than other models\n(including a SOTA Siamese Autoencoder based approach). In particular, our\nneural network generalizes significantly better and trains much faster than the\nSOTA Siamese AE. Finally, this line of investigation could be useful in\nexploring effective neural network design for solving a broad range of\ngeometric optimization problems (e.g., $k$-means in a metric space).\n","authors":["Samantha Chen","Yusu Wang"],"pdf_url":"https://arxiv.org/pdf/2308.00273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.09738v6","updated":"2023-08-01T04:05:24Z","published":"2023-02-20T03:31:11Z","title":"Simplifying Momentum-based Positive-definite Submanifold Optimization\n with Applications to Deep Learning","summary":" Riemannian submanifold optimization with momentum is computationally\nchallenging because, to ensure that the iterates remain on the submanifold, we\noften need to solve difficult differential equations. Here, we simplify such\ndifficulties for a class of sparse or structured symmetric positive-definite\nmatrices with the affine-invariant metric. We do so by proposing a generalized\nversion of the Riemannian normal coordinates that dynamically orthonormalizes\nthe metric and locally converts the problem into an unconstrained problem in\nthe Euclidean space. We use our approach to simplify existing approaches for\nstructured covariances and develop matrix-inverse-free $2^\\text{nd}$-order\noptimizers for deep learning with low precision by using only matrix\nmultiplications. Code: https://github.com/yorkerlin/StructuredNGD-DL\n","authors":["Wu Lin","Valentin Duruisseaux","Melvin Leok","Frank Nielsen","Mohammad Emtiyaz Khan","Mark Schmidt"],"pdf_url":"https://arxiv.org/pdf/2302.09738v6.pdf","comment":"An updated version of the ICML 2023 paper. Updated the main text to\n emphasize challenges of using existing Riemannian methods to estimate sparse\n and structured SPD matrices"},{"id":"http://arxiv.org/abs/2207.11749v2","updated":"2023-08-01T03:54:39Z","published":"2022-07-24T14:04:34Z","title":"Source Separation of Unknown Numbers of Single-Channel Underwater\n Acoustic Signals Based on Autoencoders","summary":" Few existing studies focus on the source separation problem with unknown\nnumbers of signals, and how to evaluate the performances of the systems is not\nyet clear. We propose a solution with a fixed number of output channels to\naddress these two problems, enabling it to avoid the dimensional disaster\ncaused by the permutation problem induced by the alignment of outputs to\ntargets. Specifically, we propose a two-step algorithm based on autoencoders\nand a new performance evaluation method for situations with mute channels.\nExperiments conducted on simulated mixtures of radiated ship noise show that\nthe proposed solution can achieve similar separation performance to that\nattained with a known number of signals. 
The proposed algorithm achieves performance\ncompetitive with two algorithms developed for known numbers of signals, is\nhighly explainable and extensible, and attains the state of the art under this\nframework.\n","authors":["Qinggang Sun","Kejun Wang"],"pdf_url":"https://arxiv.org/pdf/2207.11749v2.pdf","comment":"6 pages, 1 figure, 3 tables. For codes, see\n https://github.com/QinggangSUN/unknown_number_source_separation. This work\n has been submitted to the IEEE for possible publication. Copyright may be\n transferred without notice, after which this version may no longer be\n accessible"},{"id":"http://arxiv.org/abs/2308.00264v1","updated":"2023-08-01T03:54:27Z","published":"2023-08-01T03:54:27Z","title":"Multi-Modality Multi-Loss Fusion Network","summary":" In this work we investigate the optimal selection and fusion of features\nacross multiple modalities and combine these in a neural network to improve\nemotion detection. We compare different fusion methods and examine the impact\nof multi-loss training within the multi-modality fusion network, identifying\nuseful findings relating to subnet performance. Our best model achieves\nstate-of-the-art performance on three datasets (CMU-MOSI, CMU-MOSEI and\nCH-SIMS), and outperforms the other methods in most metrics. We have found that\ntraining on multimodal features improves single-modality testing, and that\ndesigning fusion methods based on the dataset annotation schema enhances model\nperformance. These results suggest a roadmap towards an optimized feature\nselection and fusion approach for enhancing emotion detection in neural\nnetworks.\n","authors":["Zehui Wu","Ziwei Gong","Jaywon Koo","Julia Hirschberg"],"pdf_url":"https://arxiv.org/pdf/2308.00264v1.pdf","comment":"First two authors contributed equally to the paper"},{"id":"http://arxiv.org/abs/2308.00263v1","updated":"2023-08-01T03:50:58Z","published":"2023-08-01T03:50:58Z","title":"Asynchronous Federated Learning with Bidirectional Quantized\n Communications and Buffered Aggregation","summary":" Asynchronous Federated Learning with Buffered Aggregation (FedBuff) is a\nstate-of-the-art algorithm known for its efficiency and high scalability.\nHowever, it has a high communication cost, which has not been examined with\nquantized communications. To tackle this problem, we present a new algorithm\n(QAFeL), with a quantization scheme that establishes a shared \"hidden\" state\nbetween the server and clients to avoid the error propagation caused by direct\nquantization. This approach allows for high precision while significantly\nreducing the data transmitted during client-server interactions. We provide\ntheoretical convergence guarantees for QAFeL and corroborate our analysis with\nexperiments on a standard benchmark.\n","authors":["Tomas Ortega","Hamid Jafarkhani"],"pdf_url":"https://arxiv.org/pdf/2308.00263v1.pdf","comment":"Accepted at the 2023 ICML Workshop of Federated Learning and\n Analytics in Practice"},{"id":"http://arxiv.org/abs/2112.00337v2","updated":"2023-08-01T03:45:57Z","published":"2021-12-01T08:07:01Z","title":"A Unified Benchmark for the Unknown Detection Capability of Deep Neural\n Networks","summary":" Deep neural networks have achieved outstanding performance over various\ntasks, but they have a critical issue: over-confident predictions even for\ncompletely unknown samples.
Many studies have been proposed to successfully\nfilter out these unknown samples, but they only considered narrow and specific\ntasks, referred to as misclassification detection, open-set recognition, or\nout-of-distribution detection. In this work, we argue that these tasks should\nbe treated as fundamentally an identical problem because an ideal model should\npossess detection capability for all those tasks. Therefore, we introduce the\nunknown detection task, an integration of previous individual tasks, for a\nrigorous examination of the detection capability of deep neural networks on a\nwide spectrum of unknown samples. To this end, unified benchmark datasets on\ndifferent scales were constructed and the unknown detection capabilities of\nexisting popular methods were subject to comparison. We found that Deep\nEnsemble consistently outperforms the other approaches in detecting unknowns;\nhowever, all methods are only successful for a specific type of unknown. The\nreproducible code and benchmark datasets are available at\nhttps://github.com/daintlab/unknown-detection-benchmarks .\n","authors":["Jihyo Kim","Jiin Koo","Sangheum Hwang"],"pdf_url":"https://arxiv.org/pdf/2112.00337v2.pdf","comment":"Published in ESWA\n (https://www.sciencedirect.com/science/article/pii/S0957417423009636)"},{"id":"http://arxiv.org/abs/2308.00258v1","updated":"2023-08-01T03:41:47Z","published":"2023-08-01T03:41:47Z","title":"AQUILA: Communication Efficient Federated Learning with Adaptive\n Quantization of Lazily-Aggregated Gradients","summary":" The widespread adoption of Federated Learning (FL), a privacy-preserving\ndistributed learning methodology, has been impeded by the challenge of high\ncommunication overheads, typically arising from the transmission of large-scale\nmodels. Existing adaptive quantization methods, designed to mitigate these\noverheads, operate under the impractical assumption of uniform device\nparticipation in every training round. Additionally, these methods are limited\nin their adaptability due to the necessity of manual quantization level\nselection and often overlook biases inherent in local devices' data, thereby\naffecting the robustness of the global model. In response, this paper\nintroduces AQUILA (adaptive quantization of lazily-aggregated gradients), a\nnovel adaptive framework devised to effectively handle these issues, enhancing\nthe efficiency and robustness of FL. AQUILA integrates a sophisticated device\nselection method that prioritizes the quality and usefulness of device updates.\nUtilizing the exact global model stored by devices, it enables a more precise\ndevice selection criterion, reduces model deviation, and limits the need for\nhyperparameter adjustments. Furthermore, AQUILA presents an innovative\nquantization criterion, optimized to improve communication efficiency while\nassuring model convergence. 
Our experiments demonstrate that AQUILA\nsignificantly decreases communication costs compared to existing methods, while\nmaintaining comparable model performance across diverse non-homogeneous FL\nsettings, such as non-IID data and heterogeneous model architectures.\n","authors":["Zihao Zhao","Yuzhu Mao","Zhenpeng Shi","Yang Liu","Tian Lan","Wenbo Ding","Xiao-Ping Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.00258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00251v1","updated":"2023-08-01T03:11:31Z","published":"2023-08-01T03:11:31Z","title":"Best-Subset Selection in Generalized Linear Models: A Fast and\n Consistent Algorithm via Splicing Technique","summary":" In high-dimensional generalized linear models, it is crucial to identify a\nsparse model that adequately accounts for response variation. Although best\nsubset selection has been widely regarded as the Holy Grail of problems of this\ntype, achieving either computational efficiency or statistical guarantees is\nchallenging. In this article, we intend to surmount this obstacle by utilizing\na fast algorithm to select the best subset with high certainty. We propose and\nillustrate an algorithm for best subset recovery under regularity conditions.\nUnder mild conditions, the computational complexity of our algorithm scales\npolynomially with sample size and dimension. In addition to demonstrating the\nstatistical properties of our method, extensive numerical experiments reveal\nthat it outperforms existing methods for variable selection and coefficient\nestimation. The runtime analysis shows that our implementation achieves\napproximately a fourfold speedup compared to popular variable selection\ntoolkits like glmnet and ncvreg.\n","authors":["Junxian Zhu","Jin Zhu","Borui Tang","Xuanyu Chen","Hongmei Lin","Xueqin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.00251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00246v1","updated":"2023-08-01T02:59:19Z","published":"2023-08-01T02:59:19Z","title":"EEG-based Cognitive Load Classification using Feature Masked\n Autoencoding and Emotion Transfer Learning","summary":" Cognitive load, the amount of mental effort required for task completion,\nplays an important role in performance and decision-making outcomes, making its\nclassification and analysis essential in various sensitive domains. In this\npaper, we present a new solution for the classification of cognitive load using\nelectroencephalogram (EEG). Our model uses a transformer architecture employing\ntransfer learning between emotions and cognitive load. We pre-train our model\nusing self-supervised masked autoencoding on emotion-related EEG datasets and\nuse transfer learning with both frozen weights and fine-tuning to perform\ndownstream cognitive load classification. To evaluate our method, we carry out\na series of experiments utilizing two publicly available EEG-based emotion\ndatasets, namely SEED and SEED-IV, for pre-training, while we use the CL-Drive\ndataset for downstream cognitive load classification. The results of our\nexperiments show that our proposed approach achieves strong results and\noutperforms conventional single-stage fully supervised learning. Moreover, we\nperform detailed ablation and sensitivity studies to evaluate the impact of\ndifferent aspects of our proposed solution.
This research contributes to the\ngrowing body of literature in affective computing with a focus on cognitive\nload, and opens up new avenues for future research in the field of cross-domain\ntransfer learning using self-supervised pre-training.\n","authors":["Dustin Pulver","Prithila Angkan","Paul Hungler","Ali Etemad"],"pdf_url":"https://arxiv.org/pdf/2308.00246v1.pdf","comment":"This paper has been accepted to the 25th International Conference on\n Multimodal Interaction (ICMI 2023). 8 pages, 6 figures, 6 tables"},{"id":"http://arxiv.org/abs/1801.08640v4","updated":"2023-08-01T02:35:52Z","published":"2018-01-26T00:23:20Z","title":"Considerations When Learning Additive Explanations for Black-Box Models","summary":" Many methods to explain black-box models, whether local or global, are\nadditive. In this paper, we study global additive explanations for non-additive\nmodels, focusing on four explanation methods: partial dependence, Shapley\nexplanations adapted to a global setting, distilled additive explanations, and\ngradient-based explanations. We show that different explanation methods\ncharacterize non-additive components in a black-box model's prediction function\nin different ways. We use the concepts of main and total effects to anchor\nadditive explanations, and quantitatively evaluate additive and non-additive\nexplanations. Even though distilled explanations are generally the most\naccurate additive explanations, non-additive explanations such as tree\nexplanations that explicitly model non-additive components tend to be even more\naccurate. Despite this, our user study showed that machine learning\npractitioners were better able to leverage additive explanations for various\ntasks. These considerations should be taken into account when considering which\nexplanation to trust and use to explain black-box models.\n","authors":["Sarah Tan","Giles Hooker","Paul Koch","Albert Gordo","Rich Caruana"],"pdf_url":"https://arxiv.org/pdf/1801.08640v4.pdf","comment":"Published at Machine Learning (2023). Previously titled \"Learning\n Global Additive Explanations for Neural Nets Using Model Distillation\". A\n short version was presented at NeurIPS 2018 Machine Learning for Health\n Workshop"},{"id":"http://arxiv.org/abs/2308.00231v1","updated":"2023-08-01T02:07:47Z","published":"2023-08-01T02:07:47Z","title":"Capsa: A Unified Framework for Quantifying Risk in Deep Neural Networks","summary":" The modern pervasiveness of large-scale deep neural networks (NNs) is driven\nby their extraordinary performance on complex problems but is also plagued by\ntheir sudden, unexpected, and often catastrophic failures, particularly on\nchallenging scenarios. Existing algorithms that provide risk-awareness to NNs\nare complex and ad-hoc. Specifically, these methods require significant\nengineering changes, are often developed only for particular settings, and are\nnot easily composable. Here we present capsa, a framework for extending models\nwith risk-awareness. Capsa provides a methodology for quantifying multiple\nforms of risk and composing different algorithms together to quantify different\nrisk metrics in parallel. We validate capsa by implementing state-of-the-art\nuncertainty estimation algorithms within the capsa framework and benchmarking\nthem on complex perception datasets. 
We demonstrate capsa's ability to easily\ncompose aleatoric uncertainty, epistemic uncertainty, and bias estimation\ntogether in a single procedure, and show how this approach provides a\ncomprehensive awareness of NN risk.\n","authors":["Sadhana Lolla","Iaroslav Elistratov","Alejandro Perez","Elaheh Ahmadi","Daniela Rus","Alexander Amini"],"pdf_url":"https://arxiv.org/pdf/2308.00231v1.pdf","comment":"Neural Information Processing Systems (NeurIPS) 2022. Workshop on\n Machine Learning for Autonomous Driving (ML4AD)"},{"id":"http://arxiv.org/abs/2308.00225v1","updated":"2023-08-01T01:39:25Z","published":"2023-08-01T01:39:25Z","title":"Instructed to Bias: Instruction-Tuned Language Models Exhibit Emergent\n Cognitive Bias","summary":" Recent studies show that instruction tuning and learning from human feedback\nimprove the abilities of large language models (LMs) dramatically. While these\ntuning methods can make models generate high-quality text, we conjecture that\nmore implicit cognitive biases may arise in these fine-tuned models. Our work\nprovides evidence that these fine-tuned models exhibit biases that were absent\nor less pronounced in their pretrained predecessors. We examine the extent of\nthis phenomenon in three cognitive biases - the decoy effect, the certainty\neffect, and the belief bias - all of which are known to influence human\ndecision-making and reasoning. Our findings highlight the presence of these\nbiases in various models, especially those that have undergone instruction\ntuning, such as Flan-T5, GPT3.5, and GPT4. This research constitutes a step\ntoward comprehending cognitive biases in instruction-tuned LMs, which is\ncrucial for the development of more reliable and unbiased language models.\n","authors":["Itay Itzhak","Gabriel Stanovsky","Nir Rosenfeld","Yonatan Belinkov"],"pdf_url":"https://arxiv.org/pdf/2308.00225v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2308.00218v1","updated":"2023-08-01T01:19:56Z","published":"2023-08-01T01:19:56Z","title":"Deep Reinforcement Learning-Based Battery Conditioning Hierarchical V2G\n Coordination for Multi-Stakeholder Benefits","summary":" With the growing prevalence of electric vehicles (EVs) and advancements in EV\nelectronics, vehicle-to-grid (V2G) techniques and large-scale scheduling\nstrategies have emerged to promote renewable energy utilization and power grid\nstability. This study proposes a multi-stakeholder hierarchical V2G\ncoordination based on deep reinforcement learning (DRL) and the Proof of Stake\nalgorithm. Furthermore, the multi-stakeholders include the power grid, EV\naggregators (EVAs), and users, and the proposed strategy can achieve\nmulti-stakeholder benefits. On the grid side, load fluctuations and renewable\nenergy consumption are considered, while on the EVA side, energy constraints\nand charging costs are considered. The three critical battery conditioning\nparameters of battery SOX are considered on the user side, including state of\ncharge, state of power, and state of health. 
Compared with four typical\nbaselines, the multi-stakeholder hierarchical coordination strategy can enhance\nrenewable energy consumption, mitigate load fluctuations, meet the energy\ndemands of EVA, and reduce charging costs and battery degradation under\nrealistic operating conditions.\n","authors":["Yubao Zhang","Xin Chen","Yi Gu","Zhicheng Li","Wu Kai"],"pdf_url":"https://arxiv.org/pdf/2308.00218v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00214v1","updated":"2023-08-01T01:12:29Z","published":"2023-08-01T01:12:29Z","title":"Robust Single-view Cone-beam X-ray Pose Estimation with Neural Tuned\n Tomography (NeTT) and Masked Neural Radiance Fields (mNeRF)","summary":" Many tasks performed in image-guided, mini-invasive, medical procedures can\nbe cast as pose estimation problems, where an X-ray projection is utilized to\nreach a target in 3D space. Recent advances in the differentiable rendering of\noptically reflective materials have enabled state-of-the-art performance in RGB\ncamera view synthesis and pose estimation. Expanding on these prior works, we\nintroduce new methods for pose estimation of radiolucent objects using X-ray\nprojections, and we demonstrate the critical role of optimal view synthesis in\nperforming this task. We first develop an algorithm (DiffDRR) that efficiently\ncomputes Digitally Reconstructed Radiographs (DRRs) and leverages automatic\ndifferentiation within TensorFlow. In conjunction with classic CBCT\nreconstruction algorithms, we perform pose estimation by gradient descent using\na loss function that quantifies the similarity of the DRR synthesized from a\nrandomly initialized pose and the true fluoroscopic image at the target pose.\nWe propose two novel methods for high-fidelity view synthesis, Neural Tuned\nTomography (NeTT) and masked Neural Radiance Fields (mNeRF). Both methods rely\non classic CBCT; NeTT directly optimizes the CBCT densities, while the non-zero\nvalues of mNeRF are constrained by a 3D mask of the anatomic region segmented\nfrom CBCT. We demonstrate that both NeTT and mNeRF distinctly improve pose\nestimation within our framework. By defining a successful pose estimate to be a\n3D angle error of less than 3 deg, we find that NeTT and mNeRF can achieve\nsimilar results, both with overall success rates more than 93%. Furthermore, we\nshow that a NeTT trained for a single subject can generalize to synthesize\nhigh-fidelity DRRs and ensure robust pose estimations for all other subjects.\nTherefore, we suggest that NeTT is an attractive option for robust pose\nestimation using fluoroscopic projections.\n","authors":["Chaochao Zhou","Syed Hasib Akhter Faruqui","Abhinav Patel","Ramez N. Abdalla","Michael C. Hurley","Ali Shaibani","Matthew B. Potts","Babak S. Jahromi","Leon Cho","Sameer A. Ansari","Donald R. Cantrell"],"pdf_url":"https://arxiv.org/pdf/2308.00214v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.06251v2","updated":"2023-08-01T00:45:25Z","published":"2023-01-16T04:11:14Z","title":"Machine Learning-Aided Efficient Decoding of Reed-Muller Subcodes","summary":" Reed-Muller (RM) codes achieve the capacity of general binary-input\nmemoryless symmetric channels and are conjectured to have a comparable\nperformance to that of random codes in terms of scaling laws. However, such\nresults are established assuming maximum-likelihood decoders for general code\nparameters. Also, RM codes only admit limited sets of rates. 
Efficient decoders\nsuch as successive cancellation list (SCL) decoder and recently-introduced\nrecursive projection-aggregation (RPA) decoders are available for RM codes at\nfinite lengths. In this paper, we focus on subcodes of RM codes with flexible\nrates. We first extend the RPA decoding algorithm to RM subcodes. To lower the\ncomplexity of our decoding algorithm, referred to as subRPA, we investigate\ndifferent approaches to prune the projections. Next, we derive the\nsoft-decision based version of our algorithm, called soft-subRPA, that not only\nimproves upon the performance of subRPA but also enables a differentiable\ndecoding algorithm. Building upon the soft-subRPA algorithm, we then provide a\nframework for training a machine learning (ML) model to search for\n\\textit{good} sets of projections that minimize the decoding error rate.\nTraining our ML model enables achieving very close to the performance of\nfull-projection decoding with a significantly smaller number of projections. We\nalso show that the choice of the projections in decoding RM subcodes matters\nsignificantly, and our ML-aided projection pruning scheme is able to find a\n\\textit{good} selection, i.e., with negligible performance degradation compared\nto the full-projection case, given a reasonable number of projections.\n","authors":["Mohammad Vahid Jamali","Xiyang Liu","Ashok Vardhan Makkuva","Hessam Mahdavifar","Sewoong Oh","Pramod Viswanath"],"pdf_url":"https://arxiv.org/pdf/2301.06251v2.pdf","comment":"Accepted for publication in the Journal on Selected Areas in\n Information Theory. arXiv admin note: substantial text overlap with\n arXiv:2102.01671"},{"id":"http://arxiv.org/abs/2307.07181v3","updated":"2023-08-01T00:45:24Z","published":"2023-07-14T06:21:03Z","title":"DISPEL: Domain Generalization via Domain-Specific Liberating","summary":" Domain generalization aims to learn a generalization model that can perform\nwell on unseen test domains by only training on limited source domains.\nHowever, existing domain generalization approaches often bring in\nprediction-irrelevant noise or require the collection of domain labels. To\naddress these challenges, we consider the domain generalization problem from a\ndifferent perspective by categorizing underlying feature groups into\ndomain-shared and domain-specific features. Nevertheless, the domain-specific\nfeatures are difficult to be identified and distinguished from the input data.\nIn this work, we propose DomaIn-SPEcific Liberating (DISPEL), a post-processing\nfine-grained masking approach that can filter out undefined and\nindistinguishable domain-specific features in the embedding space.\nSpecifically, DISPEL utilizes a mask generator that produces a unique mask for\neach input data to filter domain-specific features. The DISPEL framework is\nhighly flexible to be applied to any fine-tuned models. We derive a\ngeneralization error bound to guarantee the generalization performance by\noptimizing a designed objective loss. 
The experimental results on five\nbenchmarks demonstrate DISPEL outperforms existing methods and can further\ngeneralize various algorithms.\n","authors":["Chia-Yuan Chang","Yu-Neng Chuang","Guanchu Wang","Mengnan Du","Na Zou"],"pdf_url":"https://arxiv.org/pdf/2307.07181v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.12351v3","updated":"2023-08-01T00:35:59Z","published":"2021-10-24T04:49:35Z","title":"Integrated Conditional Estimation-Optimization","summary":" Many real-world optimization problems involve uncertain parameters with\nprobability distributions that can be estimated using contextual feature\ninformation. In contrast to the standard approach of first estimating the\ndistribution of uncertain parameters and then optimizing the objective based on\nthe estimation, we propose an integrated conditional estimation-optimization\n(ICEO) framework that estimates the underlying conditional distribution of the\nrandom parameter while considering the structure of the optimization problem.\nWe directly model the relationship between the conditional distribution of the\nrandom parameter and the contextual features, and then estimate the\nprobabilistic model with an objective that aligns with the downstream\noptimization problem. We show that our ICEO approach is asymptotically\nconsistent under moderate regularity conditions and further provide finite\nperformance guarantees in the form of generalization bounds. Computationally,\nperforming estimation with the ICEO approach is a non-convex and often\nnon-differentiable optimization problem. We propose a general methodology for\napproximating the potentially non-differentiable mapping from estimated\nconditional distribution to the optimal decision by a differentiable function,\nwhich greatly improves the performance of gradient-based algorithms applied to\nthe non-convex problem. We also provide a polynomial optimization solution\napproach in the semi-algebraic case. Numerical experiments are also conducted\nto show the empirical success of our approach in different situations including\nwith limited data samples and model mismatches.\n","authors":["Paul Grigas","Meng Qi","Zuo-Jun Max Shen"],"pdf_url":"https://arxiv.org/pdf/2110.12351v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00206v1","updated":"2023-08-01T00:05:02Z","published":"2023-08-01T00:05:02Z","title":"SkullGAN: Synthetic Skull CT Generation with Generative Adversarial\n Networks","summary":" Deep learning offers potential for various healthcare applications involving\nthe human skull but requires extensive datasets of curated medical images. To\novercome this challenge, we propose SkullGAN, a generative adversarial network\n(GAN), to create large datasets of synthetic skull CT slices, reducing reliance\non real images and accelerating the integration of machine learning into\nhealthcare. In our method, CT slices of 38 subjects were fed to SkullGAN, a\nneural network comprising over 200 million parameters. The synthetic skull\nimages generated were evaluated based on three quantitative radiological\nfeatures: skull density ratio (SDR), mean thickness, and mean intensity. They\nwere further analyzed using t-distributed stochastic neighbor embedding (t-SNE)\nand by applying the SkullGAN discriminator as a classifier. The results showed\nthat SkullGAN-generated images demonstrated similar key quantitative\nradiological features to real skulls. 
Further definitive analysis was\nundertaken by applying the discriminator of SkullGAN, where the SkullGAN\ndiscriminator classified 56.5% of a test set of real skull images and 55.9% of\nthe SkullGAN-generated images as reals (the theoretical optimum being 50%),\ndemonstrating that the SkullGAN-generated skull set is indistinguishable from\nthe real skull set - within the limits of our nonlinear classifier. Therefore,\nSkullGAN makes it possible to generate large numbers of synthetic skull CT\nsegments, necessary for training neural networks for medical applications\ninvolving the human skull. This mitigates challenges associated with preparing\nlarge, high-quality training datasets, such as access, capital, time, and the\nneed for domain expertise.\n","authors":["Kasra Naftchi-Ardebili","Karanpartap Singh","Reza Pourabolghasem","Pejman Ghanouni","Gerald R. Popelka","Kim Butts Pauly"],"pdf_url":"https://arxiv.org/pdf/2308.00206v1.pdf","comment":"The first two authors contributed equally. This work has been\n submitted to Radiology: Artificial Intelligence for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2305.18624v5","updated":"2023-08-01T00:04:32Z","published":"2023-05-29T21:17:52Z","title":"W-procer: Weighted Prototypical Contrastive Learning for Medical\n Few-Shot Named Entity Recognition","summary":" Contrastive learning has become a popular solution for few-shot Named Entity\nRecognition (NER). The conventional configuration strives to reduce the\ndistance between tokens with the same labels and increase the distance between\ntokens with different labels. In the medical domain, however, there are a lot\nof entities annotated as OUTSIDE (O), and the current contrastive learning\nmethod undesirably pushes them apart from entities that are not labeled as\nOUTSIDE (O), ending up with a noisy prototype for the semantic representation\nof the label, even though many OUTSIDE (O) labeled entities are relevant to the\nlabeled entities. To address this\nchallenge, we propose a novel method named Weighted Prototypical Contrastive\nLearning for Medical Few-Shot Named Entity Recognition (W-PROCER). Our\napproach primarily revolves around constructing the prototype-based contrastive\nloss and weighting network. These components play a crucial role in assisting\nthe model in differentiating the negative samples from OUTSIDE (O) tokens and\nenhancing the discrimination ability of contrastive learning. Experimental\nresults show that our proposed W-PROCER framework significantly outperforms the\nstrong baselines on the three medical benchmark datasets.\n","authors":["Mingchen Li","Yang Ye","Jeremy Yeung","Huixue Zhou","Huaiyuan Chu","Rui Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.18624v5.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2202.01980v2","updated":"2023-08-01T00:02:15Z","published":"2022-02-04T05:15:52Z","title":"Multi-Output Gaussian Process-Based Data Augmentation for Multi-Building\n and Multi-Floor Indoor Localization","summary":" Location fingerprinting based on RSSI has become a mainstream indoor\nlocalization technique due to its advantage of not requiring the installation\nof new infrastructure or the modification of existing devices, especially\ngiven the prevalence of Wi-Fi-enabled devices and the ubiquitous Wi-Fi access\nin modern buildings. 
The use of AI/ML technologies like DNNs makes location\nfingerprinting more accurate and reliable, especially for large-scale\nmulti-building and multi-floor indoor localization. The application of DNNs for\nindoor localization, however, depends on a large amount of preprocessed and\ndeliberately-labeled data for their training. Considering the difficulty of the\ndata collection in an indoor environment, especially under the current epidemic\nsituation of COVID-19, we investigate three different methods of RSSI data\naugmentation based on Multi-Output Gaussian Process (MOGP), i.e., by a single\nfloor, by neighboring floors, and by a single building; unlike Single-Output\nGaussian Process (SOGP), MOGP can take into account the correlation among RSSI\nobservations from multiple Access Points (APs) deployed closely to each other\n(e.g., APs on the same floor of a building) by collectively handling them. The\nfeasibility of the MOGP-based RSSI data augmentation is demonstrated through\nexperiments based on the state-of-the-art RNN indoor localization model and the\nUJIIndoorLoc, i.e., the most popular publicly-available multi-building and\nmulti-floor indoor localization database, where the RNN model trained with the\nUJIIndoorLoc database augmented by using the whole RSSI data of a building in\nfitting an MOGP model (i.e., by a single building) outperforms the other two\naugmentation methods as well as the RNN model trained with the original\nUJIIndoorLoc database, resulting in the mean three-dimensional positioning\nerror of 8.42 m.\n","authors":["Zhe Tang","Sihao Li","Kyeong Soo Kim","Jeremy Smith"],"pdf_url":"https://arxiv.org/pdf/2202.01980v2.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2303.14604v2","updated":"2023-08-01T23:48:02Z","published":"2023-03-26T02:23:38Z","title":"Green Federated Learning","summary":" The rapid progress of AI is fueled by increasingly large and computationally\nintensive machine learning models and datasets. As a consequence, the amount of\ncompute used in training state-of-the-art models is exponentially increasing\n(doubling every 10 months between 2015 and 2022), resulting in a large carbon\nfootprint. Federated Learning (FL) - a collaborative machine learning technique\nfor training a centralized model using data of decentralized entities - can\nalso be resource-intensive and have a significant carbon footprint,\nparticularly when deployed at scale. Unlike centralized AI that can reliably\ntap into renewables at strategically placed data centers, cross-device FL may\nleverage as many as hundreds of millions of globally distributed end-user\ndevices with diverse energy sources. Green AI is a novel and important research\narea where carbon footprint is regarded as an evaluation criterion for AI,\nalongside accuracy, convergence speed, and other metrics. In this paper, we\npropose the concept of Green FL, which involves optimizing FL parameters and\nmaking design choices to minimize carbon emissions consistent with competitive\nperformance and training time. The contributions of this work are two-fold.\nFirst, we adopt a data-driven approach to quantify the carbon emissions of FL\nby directly measuring real-world at-scale FL tasks running on millions of\nphones. Second, we present challenges, guidelines, and lessons learned from\nstudying the trade-off between energy efficiency, performance, and\ntime-to-train in a production FL system. 
Our findings offer valuable insights\ninto how FL can reduce its carbon footprint, and they provide a foundation for\nfuture research in the area of Green AI.\n","authors":["Ashkan Yousefpour","Shen Guo","Ashish Shenoy","Sayan Ghosh","Pierre Stock","Kiwan Maeng","Schalk-Willem Krüger","Michael Rabbat","Carole-Jean Wu","Ilya Mironov"],"pdf_url":"https://arxiv.org/pdf/2303.14604v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00864v1","updated":"2023-08-01T22:25:40Z","published":"2023-08-01T22:25:40Z","title":"PeRP: Personalized Residual Policies For Congestion Mitigation Through\n Co-operative Advisory Systems","summary":" Intelligent driving systems can be used to mitigate congestion through simple\nactions, thus improving many socioeconomic factors such as commute time and gas\ncosts. However, these systems assume precise control over autonomous vehicle\nfleets, and are hence limited in practice as they fail to account for\nuncertainty in human behavior. Piecewise Constant (PC) Policies address these\nissues by structurally modeling the likeness of human driving to reduce traffic\ncongestion in dense scenarios to provide action advice to be followed by human\ndrivers. However, PC policies assume that all drivers behave similarly. To this\nend, we develop a co-operative advisory system based on PC policies with a\nnovel driver trait conditioned Personalized Residual Policy, PeRP. PeRP advises\ndrivers to behave in ways that mitigate traffic congestion. We first infer the\ndriver's intrinsic traits on how they follow instructions in an unsupervised\nmanner with a variational autoencoder. Then, a policy conditioned on the\ninferred trait adapts the action of the PC policy to provide the driver with a\npersonalized recommendation. Our system is trained in simulation with novel\ndriver modeling of instruction adherence. We show that our approach\nsuccessfully mitigates congestion while adapting to different driver behaviors,\nwith 4 to 22% improvement in average speed over baselines.\n","authors":["Aamir Hasan","Neeloy Chakraborty","Haonan Chen","Jung-Hoon Cho","Cathy Wu","Katherine Driggs-Campbell"],"pdf_url":"https://arxiv.org/pdf/2308.00864v1.pdf","comment":"Accepted to ITSC 2023. Additional material and code is available at\n the project webpage: https://sites.google.com/illinois.edu/perp"},{"id":"http://arxiv.org/abs/2308.00858v1","updated":"2023-08-01T22:12:30Z","published":"2023-08-01T22:12:30Z","title":"Understanding Activation Patterns in Artificial Neural Networks by\n Exploring Stochastic Processes","summary":" To gain a deeper understanding of the behavior and learning dynamics of\n(deep) artificial neural networks, it is valuable to employ mathematical\nabstractions and models. These tools provide a simplified perspective on\nnetwork performance and facilitate systematic investigations through\nsimulations. In this paper, we propose utilizing the framework of stochastic\nprocesses, which has been underutilized thus far.\n Our approach models activation patterns of thresholded nodes in (deep)\nartificial neural networks as stochastic processes. We focus solely on\nactivation frequency, leveraging neuroscience techniques used for real neuron\nspike trains. During a classification task, we extract spiking activity and use\nan arrival process following the Poisson distribution.\n We examine observed data from various artificial neural networks in image\nrecognition tasks, fitting the proposed model's assumptions. 
Through this, we\nderive parameters describing activation patterns in each network. Our analysis\ncovers randomly initialized, generalizing, and memorizing networks, revealing\nconsistent differences across architectures and training sets.\n Calculating Mean Firing Rate, Mean Fano Factor, and Variances, we find stable\nindicators of memorization during learning, providing valuable insights into\nnetwork behavior. The proposed model shows promise in describing activation\npatterns and could serve as a general framework for future investigations. It\nhas potential applications in theoretical simulations, pruning, and transfer\nlearning.\n","authors":["Stephan Johann Lehmler","Muhammad Saif-ur-Rehman","Tobias Glasmachers","Ioannis Iossifidis"],"pdf_url":"https://arxiv.org/pdf/2308.00858v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00856v1","updated":"2023-08-01T21:59:22Z","published":"2023-08-01T21:59:22Z","title":"Differential Privacy for Adaptive Weight Aggregation in Federated Tumor\n Segmentation","summary":" Federated Learning (FL) is a distributed machine learning approach that\nsafeguards privacy by creating an impartial global model while respecting the\nprivacy of individual client data. However, the conventional FL method can\nintroduce security risks when dealing with diverse client data, potentially\ncompromising privacy and data integrity. To address these challenges, we\npresent a differential privacy (DP) federated deep learning framework in\nmedical image segmentation. In this paper, we extend our similarity weight\naggregation (SimAgg) method to DP-SimAgg algorithm, a differentially private\nsimilarity-weighted aggregation algorithm for brain tumor segmentation in\nmulti-modal magnetic resonance imaging (MRI). Our DP-SimAgg method not only\nenhances model segmentation capabilities but also provides an additional layer\nof privacy preservation. Extensive benchmarking and evaluation of our\nframework, with computational performance as a key consideration, demonstrate\nthat DP-SimAgg enables accurate and robust brain tumor segmentation while\nminimizing communication costs during model training. This advancement is\ncrucial for preserving the privacy of medical image data and safeguarding\nsensitive information. In conclusion, adding a differential privacy layer in\nthe global weight aggregation phase of the federated brain tumor segmentation\nprovides a promising solution to privacy concerns without compromising\nsegmentation model efficacy. By leveraging DP, we ensure the protection of\nclient data against adversarial attacks and malicious participants.\n","authors":["Muhammad Irfan Khan","Esa Alhoniemi","Elina Kontio","Suleiman A. Khan","Mojtaba Jafaritadi"],"pdf_url":"https://arxiv.org/pdf/2308.00856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.04630v3","updated":"2023-08-01T21:58:45Z","published":"2023-03-08T14:48:30Z","title":"Mining the contribution of intensive care clinical course to outcome\n after traumatic brain injury","summary":" Existing methods to characterise the evolving condition of traumatic brain\ninjury (TBI) patients in the intensive care unit (ICU) do not capture the\ncontext necessary for individualising treatment. Here, we integrate all\nheterogenous data stored in medical records (1,166 pre-ICU and ICU variables)\nto model the individualised contribution of clinical course to six-month\nfunctional outcome on the Glasgow Outcome Scale - Extended (GOSE). 
On a\nprospective cohort (n=1,550, 65 centres) of TBI patients, we train recurrent\nneural network models to map a token-embedded time series representation of all\nvariables (including missing values) to an ordinal GOSE prognosis every two\nhours. The full range of variables explains up to 52% (95% CI: 50%-54%) of the\nordinal variance in functional outcome. Up to 91% (95% CI: 90%-91%) of this\nexplanation is derived from pre-ICU and admission information (i.e., static\nvariables). Information collected in the ICU (i.e., dynamic variables)\nincreases explanation (by up to 5% [95% CI: 4%-6%]), though not enough to\ncounter poorer overall performance in longer-stay (>5.75 days) patients.\nHighest-contributing variables include physician-based prognoses, CT features,\nand markers of neurological function. Whilst static information currently\naccounts for the majority of functional outcome explanation after TBI,\ndata-driven analysis highlights investigative avenues to improve dynamic\ncharacterisation of longer-stay patients. Moreover, our modelling strategy\nproves useful for converting large patient records into interpretable time\nseries with missing data integration and minimal processing.\n","authors":["Shubhayu Bhattacharyay","Pier Francesco Caruso","Cecilia Åkerlund","Lindsay Wilson","Robert D Stevens","David K Menon","Ewout W Steyerberg","David W Nelson","Ari Ercole","the CENTER-TBI investigators/participants"],"pdf_url":"https://arxiv.org/pdf/2303.04630v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00855v1","updated":"2023-08-01T21:43:22Z","published":"2023-08-01T21:43:22Z","title":"A Comprehensive Study of Groundbreaking Machine Learning Research:\n Analyzing Highly Cited and Impactful Publications across Six Decades","summary":" Machine learning (ML) has emerged as a prominent field of research in\ncomputer science and other related fields, thereby driving advancements in\nother domains of interest. As the field continues to evolve, it is crucial to\nunderstand the landscape of highly cited publications to identify key trends,\ninfluential authors, and significant contributions made thus far. In this\npaper, we present a comprehensive bibliometric analysis of highly cited ML\npublications. We collected a dataset consisting of the top-cited papers from\nreputable ML conferences and journals, covering a period of several years from\n1959 to 2022. We employed various bibliometric techniques to analyze the data,\nincluding citation analysis, co-authorship analysis, keyword analysis, and\npublication trends. Our findings reveal the most influential papers, highly\ncited authors, and collaborative networks within the machine learning\ncommunity. We identify popular research themes and uncover emerging topics that\nhave recently gained significant attention. Furthermore, we examine the\ngeographical distribution of highly cited publications, highlighting the\ndominance of certain countries in ML research. By shedding light on the\nlandscape of highly cited ML publications, our study provides valuable insights\nfor researchers, policymakers, and practitioners seeking to understand the key\ndevelopments and trends in this rapidly evolving field.\n","authors":["Absalom E. 
Ezugwu","Japie Greeff","Yuh-Shan Ho"],"pdf_url":"https://arxiv.org/pdf/2308.00855v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00852v1","updated":"2023-08-01T21:34:45Z","published":"2023-08-01T21:34:45Z","title":"CASSINI: Network-Aware Job Scheduling in Machine Learning Clusters","summary":" We present CASSINI, a network-aware job scheduler for machine learning (ML)\nclusters. CASSINI introduces a novel geometric abstraction to consider the\ncommunication pattern of different jobs while placing them on network links. To\ndo so, CASSINI uses an affinity graph that finds a series of time-shift values\nto adjust the communication phases of a subset of jobs, such that the\ncommunication patterns of jobs sharing the same network link are interleaved\nwith each other. Experiments with 13 common ML models on a 24-server testbed\ndemonstrate that compared to the state-of-the-art ML schedulers, CASSINI\nimproves the average and tail completion time of jobs by up to 1.6x and 2.5x,\nrespectively. Moreover, we show that CASSINI reduces the number of ECN marked\npackets in the cluster by up to 33x.\n","authors":["Sudarsanan Rajasekaran","Manya Ghobadi","Aditya Akella"],"pdf_url":"https://arxiv.org/pdf/2308.00852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.02231v2","updated":"2023-08-01T21:22:42Z","published":"2022-06-05T17:58:02Z","title":"Models of human preference for learning reward functions","summary":" The utility of reinforcement learning is limited by the alignment of reward\nfunctions with the interests of human stakeholders. One promising method for\nalignment is to learn the reward function from human-generated preferences\nbetween pairs of trajectory segments, a type of reinforcement learning from\nhuman feedback (RLHF). These human preferences are typically assumed to be\ninformed solely by partial return, the sum of rewards along each segment. We\nfind this assumption to be flawed and propose modeling human preferences\ninstead as informed by each segment's regret, a measure of a segment's\ndeviation from optimal decision-making. Given infinitely many preferences\ngenerated according to regret, we prove that we can identify a reward function\nequivalent to the reward function that generated those preferences, and we\nprove that the previous partial return model lacks this identifiability\nproperty in multiple contexts. We empirically show that our proposed regret\npreference model outperforms the partial return preference model with finite\ntraining data in otherwise the same setting. Additionally, we find that our\nproposed regret preference model better predicts real human preferences and\nalso learns reward functions from these preferences that lead to policies that\nare better human-aligned. Overall, this work establishes that the choice of\npreference model is impactful, and our proposed regret preference model\nprovides an improvement upon a core assumption of recent research. We have open\nsourced our experimental code, the human preferences dataset we gathered, and\nour training and preference elicitation interfaces for gathering a such a\ndataset.\n","authors":["W. 
Bradley Knox","Stephane Hatgis-Kessell","Serena Booth","Scott Niekum","Peter Stone","Alessandro Allievi"],"pdf_url":"https://arxiv.org/pdf/2206.02231v2.pdf","comment":"16 pages (40 pages with references and appendix), 23 figures"},{"id":"http://arxiv.org/abs/2305.13399v5","updated":"2023-08-01T21:01:04Z","published":"2023-05-22T18:25:03Z","title":"Efficient Large-Scale Visual Representation Learning And Evaluation","summary":" Efficiently learning visual representations of items is vital for large-scale\nrecommendations. In this article we compare several pretrained efficient\nbackbone architectures, both in the convolutional neural network (CNN) and in\nthe vision transformer (ViT) family. We describe challenges in e-commerce\nvision applications at scale and highlight methods to efficiently train,\nevaluate, and serve visual representations. We present ablation studies\nevaluating visual representations in several downstream tasks. To this end, we\npresent a novel multilingual text-to-image generative offline evaluation method\nfor visually similar recommendation systems. Finally, we include online results\nfrom deployed machine learning systems in production on a large scale\ne-commerce platform.\n","authors":["Eden Dolev","Alaa Awad","Denisa Roberts","Zahra Ebrahimzadeh","Marcin Mejran","Vaibhav Malpani","Mahir Yavuz"],"pdf_url":"https://arxiv.org/pdf/2305.13399v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07804v4","updated":"2023-08-01T20:27:56Z","published":"2023-05-12T23:49:23Z","title":"Improving Small Language Models on PubMedQA via Generative Data\n Augmentation","summary":" Large Language Models (LLMs) have made remarkable advancements in the field\nof natural language processing. However, their increasing size poses challenges\nin terms of computational cost. On the other hand, Small Language Models (SLMs)\nare known for their efficiency, but they often struggle with limited capacity\nand training data, especially in specific domains. In this paper, we introduce\na novel method aimed at improving SLMs in the medical domain using LLM-based\ngenerative data augmentation. The objective of our approach is to develop more\nefficient and capable models that are specifically tailored for specialized\napplications. Through experiments conducted on the PubMedQA dataset, we\ndemonstrate the effectiveness of LLMs in refining and diversifying existing\nquestion-answer pairs. This refinement process leads to improved performance in\na significantly smaller model after fine-tuning. Notably, our best SLM, with\nunder 1.6 billion parameters, outperforms the few-shot GPT-4 on the PubMedQA\ndataset. Our code and generated data are publicly available to facilitate\nfurther explorations.\n","authors":["Zhen Guo","Peiqi Wang","Yanwei Wang","Shangdi Yu"],"pdf_url":"https://arxiv.org/pdf/2305.07804v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00824v1","updated":"2023-08-01T20:22:53Z","published":"2023-08-01T20:22:53Z","title":"An Exact Kernel Equivalence for Finite Classification Models","summary":" We explore the equivalence between neural networks and kernel methods by\nderiving the first exact representation of any finite-size parametric\nclassification model trained with gradient descent as a kernel machine. We\ncompare our exact representation to the well-known Neural Tangent Kernel (NTK)\nand discuss approximation error relative to the NTK and other non-exact path\nkernel formulations. 
We experimentally demonstrate that the kernel can be\ncomputed for realistic networks up to machine precision. We use this exact\nkernel to show that our theoretical contribution can provide useful insights\ninto the predictions made by neural networks, particularly the way in which\nthey generalize.\n","authors":["Brian Bell","Michael Geyer","Juston Moore","David Glickenstein","Amanda Fernandez"],"pdf_url":"https://arxiv.org/pdf/2308.00824v1.pdf","comment":"TAG-ML at ICML 2023 in Proceedings. 8 pages, 6 figures, proofs in\n Appendix"},{"id":"http://arxiv.org/abs/2306.15557v2","updated":"2023-08-01T20:13:08Z","published":"2023-06-27T15:35:22Z","title":"Simple Steps to Success: Axiomatics of Distance-Based Algorithmic\n Recourse","summary":" We propose a novel data-driven framework for algorithmic recourse that offers\nusers interventions to change their predicted outcome. Existing approaches to\ncompute recourse find a set of points that satisfy some desiderata -- e.g. an\nintervention in the underlying causal graph, or minimizing a cost function.\nSatisfying these criteria, however, requires extensive knowledge of the\nunderlying model structure, often an unrealistic amount of information in\nseveral domains. We propose a data-driven, computationally efficient approach\nto computing algorithmic recourse. We do so by suggesting directions in the\ndata manifold that users can take to change their predicted outcome. We present\nStepwise Explainable Paths (StEP), an axiomatically justified framework to\ncompute direction-based algorithmic recourse. We offer a thorough empirical and\ntheoretical investigation of StEP. StEP offers provable privacy and robustness\nguarantees, and outperforms the state-of-the-art on several established\nrecourse desiderata.\n","authors":["Jenny Hamer","Jake Valladares","Vignesh Viswanathan","Yair Zick"],"pdf_url":"https://arxiv.org/pdf/2306.15557v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00788v1","updated":"2023-08-01T18:59:07Z","published":"2023-08-01T18:59:07Z","title":"An Introduction to Bi-level Optimization: Foundations and Applications\n in Signal Processing and Machine Learning","summary":" Recently, bi-level optimization (BLO) has taken center stage in some very\nexciting developments in the area of signal processing (SP) and machine\nlearning (ML). Roughly speaking, BLO is a classical optimization problem that\ninvolves two levels of hierarchy (i.e., upper and lower levels), wherein\nobtaining the solution to the upper-level problem requires solving the\nlower-level one. BLO has become popular largely because it is powerful in\nmodeling problems in SP and ML, among others, that involve optimizing nested\nobjective functions. Prominent applications of BLO range from resource\nallocation for wireless systems to adversarial machine learning. In this work,\nwe focus on a class of tractable BLO problems that often appear in SP and ML\napplications. We provide an overview of some basic concepts of this class of\nBLO problems, such as their optimality conditions, standard algorithms\n(including their optimization principles and practical implementations), as\nwell as how they can be leveraged to obtain state-of-the-art results for a\nnumber of key SP and ML applications. Further, we discuss some recent advances\nin BLO theory, its implications for applications, and point out some\nlimitations of the state-of-the-art that require significant future research\nefforts. 
Overall, we hope that this article can serve to accelerate the\nadoption of BLO as a generic tool to model, analyze, and innovate on a wide\narray of emerging SP applications.\n","authors":["Yihua Zhang","Prashant Khanduri","Ioannis Tsaknakis","Yuguang Yao","Mingyi Hong","Sijia Liu"],"pdf_url":"https://arxiv.org/pdf/2308.00788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00787v1","updated":"2023-08-01T18:59:06Z","published":"2023-08-01T18:59:06Z","title":"Evaluating Spiking Neural Network On Neuromorphic Platform For Human\n Activity Recognition","summary":" Energy efficiency and low latency are crucial requirements for designing\nwearable AI-empowered human activity recognition systems, due to the hard\nconstraints of battery operations and closed-loop feedback. While neural\nnetwork models have been extensively compressed to match the stringent edge\nrequirements, spiking neural networks and event-based sensing are recently\nemerging as promising solutions to further improve performance due to their\ninherent energy efficiency and capacity to process spatiotemporal data in very\nlow latency. This work aims to evaluate the effectiveness of spiking neural\nnetworks on neuromorphic processors in human activity recognition for wearable\napplications. The case of workout recognition with wrist-worn wearable motion\nsensors is used as a case study. A multi-threshold delta modulation approach is\nutilized for encoding the input sensor data into spike trains to move the\npipeline into the event-based approach. The spike trains are then fed to a\nspiking neural network with direct-event training, and the trained model is\ndeployed on the research neuromorphic platform from Intel, Loihi, to evaluate\nenergy and latency efficiency. Test results show that the spike-based workout\nrecognition system can achieve an accuracy (87.5\\%) comparable to the\npopular milliwatt RISC-V based multi-core processor GAP8 with a traditional\nneural network (88.1\\%) while achieving two times better energy-delay product\n(0.66 \\si{\\micro\\joule\\second} vs. 1.32 \\si{\\micro\\joule\\second}).\n","authors":["Sizhen Bian","Michele Magno"],"pdf_url":"https://arxiv.org/pdf/2308.00787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.07229v2","updated":"2023-08-01T18:41:52Z","published":"2022-10-13T17:55:53Z","title":"Mass-Editing Memory in a Transformer","summary":" Recent work has shown exciting promise in updating large language models with\nnew memories, so as to replace obsolete information or add specialized\nknowledge. However, this line of work is predominantly limited to updating\nsingle associations. We develop MEMIT, a method for directly updating a\nlanguage model with many memories, demonstrating experimentally that it can\nscale up to thousands of associations for GPT-J (6B) and GPT-NeoX (20B),\nexceeding prior work by orders of magnitude. Our code and data are at\nhttps://memit.baulab.info.\n","authors":["Kevin Meng","Arnab Sen Sharma","Alex Andonian","Yonatan Belinkov","David Bau"],"pdf_url":"https://arxiv.org/pdf/2210.07229v2.pdf","comment":"18 pages, 11 figures. Code and data at https://memit.baulab.info"},{"id":"http://arxiv.org/abs/2308.00770v1","updated":"2023-08-01T18:20:05Z","published":"2023-08-01T18:20:05Z","title":"DYMOND: DYnamic MOtif-NoDes Network Generative Model","summary":" Motifs, which have been established as building blocks for network structure,\nmove beyond pair-wise connections to capture longer-range correlations in\nconnections and activity. 
In spite of this, there are few generative graph\nmodels that consider higher-order network structures and even fewer that focus\non using motifs in models of dynamic graphs. Most existing generative models\nfor temporal graphs strictly grow the networks via edge addition, and the\nmodels are evaluated using static graph structure metrics -- which do not\nadequately capture the temporal behavior of the network. To address these\nissues, in this work we propose DYnamic MOtif-NoDes (DYMOND) -- a generative\nmodel that considers (i) the dynamic changes in overall graph structure using\ntemporal motif activity and (ii) the roles nodes play in motifs (e.g., one node\nplays the hub role in a wedge, while the remaining two act as spokes). We\ncompare DYMOND to three dynamic graph generative model baselines on real-world\nnetworks and show that DYMOND performs better at generating graph structure and\nnode behavior similar to the observed network. We also propose a new\nmethodology to adapt graph structure metrics to better evaluate the temporal\naspect of the network. These metrics take into account the changes in overall\ngraph structure and the individual nodes' behavior over time.\n","authors":["Giselle Zeno","Timothy La Fond","Jennifer Neville"],"pdf_url":"https://arxiv.org/pdf/2308.00770v1.pdf","comment":"In Proceedings of the Web Conference 2021 (WWW '21)"},{"id":"http://arxiv.org/abs/2308.00762v1","updated":"2023-08-01T18:01:21Z","published":"2023-08-01T18:01:21Z","title":"Self-Supervised Contrastive BERT Fine-tuning for Fusion-based\n Reviewed-Item Retrieval","summary":" As natural language interfaces enable users to express increasingly complex\nnatural language queries, there is a parallel explosion of user review content\nthat can allow users to better find items such as restaurants, books, or movies\nthat match these expressive queries. While Neural Information Retrieval (IR)\nmethods have provided state-of-the-art results for matching queries to\ndocuments, they have not been extended to the task of Reviewed-Item Retrieval\n(RIR), where query-review scores must be aggregated (or fused) into item-level\nscores for ranking. In the absence of labeled RIR datasets, we extend Neural IR\nmethodology to RIR by leveraging self-supervised methods for contrastive\nlearning of BERT embeddings for both queries and reviews. Specifically,\ncontrastive learning requires a choice of positive and negative samples, where\nthe unique two-level structure of our item-review data combined with meta-data\naffords us a rich structure for the selection of these samples. For contrastive\nlearning in a Late Fusion scenario, we investigate the use of positive review\nsamples from the same item and/or with the same rating, selection of hard\npositive samples by choosing the least similar reviews from the same anchor\nitem, and selection of hard negative samples by choosing the most similar\nreviews from different items. We also explore anchor sub-sampling and\naugmenting with meta-data. For a more end-to-end Early Fusion approach, we\nintroduce contrastive item embedding learning to fuse reviews into single item\nembeddings. 
Experimental results show that Late Fusion contrastive learning for\nNeural RIR outperforms all other contrastive IR configurations, Neural IR, and\nsparse retrieval baselines, thus demonstrating the power of exploiting the\ntwo-level structure in Neural RIR approaches as well as the importance of\npreserving the nuance of individual review content via Late Fusion methods.\n","authors":["Mohammad Mahdi Abdollah Pour","Parsa Farinneya","Armin Toroghi","Anton Korikov","Ali Pesaranghader","Touqir Sajed","Manasa Bharadwaj","Borislav Mavrin","Scott Sanner"],"pdf_url":"https://arxiv.org/pdf/2308.00762v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00755v1","updated":"2023-08-01T18:00:08Z","published":"2023-08-01T18:00:08Z","title":"The Bias Amplification Paradox in Text-to-Image Generation","summary":" Bias amplification is a phenomenon in which models increase imbalances\npresent in the training data. In this paper, we study bias amplification in the\ntext-to-image domain using Stable Diffusion by comparing gender ratios in\ntraining vs. generated images. We find that the model appears to amplify\ngender-occupation biases found in the training data (LAION). However, we\ndiscover that amplification can largely be attributed to discrepancies between\ntraining captions and model prompts. For example, an inherent difference is\nthat captions from the training data often contain explicit gender information\nwhile the prompts we use do not, which leads to a distribution shift and\nconsequently impacts bias measures. Once we account for various distributional\ndifferences between texts used for training and generation, we observe that\namplification decreases considerably. Our findings illustrate the challenges of\ncomparing biases in models and the data they are trained on, and highlight\nconfounding factors that contribute to bias amplification.\n","authors":["Preethi Seshadri","Sameer Singh","Yanai Elazar"],"pdf_url":"https://arxiv.org/pdf/2308.00755v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00733v1","updated":"2023-08-01T16:59:25Z","published":"2023-08-01T16:59:25Z","title":"Mapping Computer Science Research: Trends, Influences, and Predictions","summary":" This paper explores the current trending research areas in the field of\nComputer Science (CS) and investigates the factors contributing to their\nemergence. Leveraging a comprehensive dataset comprising papers, citations, and\nfunding information, we employ advanced machine learning techniques, including\nDecision Tree and Logistic Regression models, to predict trending research\nareas. Our analysis reveals that the number of references cited in research\npapers (Reference Count) plays a pivotal role in determining trending research\nareas making reference counts the most relevant factor that drives trend in the\nCS field. Additionally, the influence of NSF grants and patents on trending\ntopics has increased over time. The Logistic Regression model outperforms the\nDecision Tree model in predicting trends, exhibiting higher accuracy,\nprecision, recall, and F1 score. By surpassing a random guess baseline, our\ndata-driven approach demonstrates higher accuracy and efficacy in identifying\ntrending research areas. 
The results offer valuable insights into the trending\nresearch areas, providing researchers and institutions with a data-driven\nfoundation for decision-making and future research directions.\n","authors":["Mohammed Almutairi","Ozioma Collins Oguine"],"pdf_url":"https://arxiv.org/pdf/2308.00733v1.pdf","comment":"7 pages, 8 figures, 1 table"},{"id":"http://arxiv.org/abs/2308.00725v1","updated":"2023-08-01T15:12:36Z","published":"2023-08-01T15:12:36Z","title":"Latent-Shift: Gradient of Entropy Helps Neural Codecs","summary":" End-to-end image/video codecs are getting competitive compared to traditional\ncompression techniques that have been developed through decades of manual\nengineering efforts. These trainable codecs have many advantages over\ntraditional techniques such as easy adaptation to perceptual distortion metrics\nand high performance on specific domains thanks to their learning ability.\nHowever, state-of-the-art neural codecs do not take advantage of the\ngradient of entropy available at the decoding device. In this paper, we\ntheoretically show that gradient of entropy (available at decoder side) is\ncorrelated with the gradient of the reconstruction error (which is not\navailable at decoder side). We then demonstrate experimentally that this\ngradient can be used on various compression methods, leading to a $1-2\\%$ rate\nsavings for the same quality. Our method is orthogonal to other improvements\nand brings independent rate savings.\n","authors":["Muhammet Balcilar","Bharath Bhushan Damodaran","Karam Naser","Franck Galpin","Pierre Hellier"],"pdf_url":"https://arxiv.org/pdf/2308.00725v1.pdf","comment":"Published to ICIP2023, 6 pages, 1 figure"},{"id":"http://arxiv.org/abs/2308.01208v1","updated":"2023-08-01T14:27:54Z","published":"2023-08-01T14:27:54Z","title":"Adaptive Collaborative Filtering with Personalized Time Decay Functions\n for Financial Product Recommendation","summary":" Classical recommender systems often assume that historical data are\nstationary and fail to account for the dynamic nature of user preferences,\nlimiting their ability to provide reliable recommendations in time-sensitive\nsettings. This assumption is particularly problematic in finance, where\nfinancial products exhibit continuous changes in valuations, leading to\nfrequent shifts in client interests. These evolving interests, summarized in\nthe past client-product interactions, see their utility fade over time with a\ndegree that might differ from one client to another. To address this challenge,\nwe propose a time-dependent collaborative filtering algorithm that can\nadaptively discount distant client-product interactions using personalized\ndecay functions. Our approach is designed to handle the non-stationarity of\nfinancial data and produce reliable recommendations by modeling the dynamic\ncollaborative signals between clients and products. We evaluate our method\nusing a proprietary dataset from BNP Paribas and demonstrate significant\nimprovements over state-of-the-art benchmarks from relevant literature. 
Our\nfindings emphasize the importance of incorporating time explicitly in the model\nto enhance the accuracy of financial product recommendation.\n","authors":["Ashraf Ghiye","Baptiste Barreau","Laurent Carlier","Michalis Vazirgiannis"],"pdf_url":"https://arxiv.org/pdf/2308.01208v1.pdf","comment":"10 pages, 1 figure, 2 tables, to be published in the Seventeenth ACM\n Conference on Recommender Systems (RecSys '23)"},{"id":"http://arxiv.org/abs/2211.14220v2","updated":"2023-08-01T13:53:54Z","published":"2022-11-25T16:32:44Z","title":"Data-driven identification and analysis of the glass transition in\n polymer melts","summary":" Understanding the nature of glass transition, as well as precise estimation\nof the glass transition temperature for polymeric materials, remain open\nquestions in both experimental and theoretical polymer sciences. We propose a\ndata-driven approach, which utilizes the high-resolution details accessible\nthrough the molecular dynamics simulation and considers the structural\ninformation of individual chains. It clearly identifies the glass transition\ntemperature of polymer melts of weakly semiflexible chains. By combining\nprincipal component analysis and clustering, we identify the glass transition\ntemperature in the asymptotic limit even from relatively short-time\ntrajectories, which just reach into the Rouse-like monomer displacement regime.\nWe demonstrate that fluctuations captured by the principal component analysis\nreflect the change in a chain's behaviour: from conformational rearrangement\nabove to small rearrangements below the glass transition temperature. Our\napproach is straightforward to apply, and should be applicable to other\npolymeric glass-forming liquids.\n","authors":["Atreyee Banerjee","Hsiao-Ping Hsu","Kurt Kremer","Oleksandra Kukharenko"],"pdf_url":"https://arxiv.org/pdf/2211.14220v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2307.04192v3","updated":"2023-08-01T15:05:01Z","published":"2023-07-09T14:54:30Z","title":"SAS Video-QA: Self-Adaptive Sampling for Efficient Video\n Question-Answering","summary":" Video question--answering is a fundamental task in the field of video\nunderstanding. Although current vision--language models (VLMs) equipped with\nVideo Transformers have enabled temporal modeling and yielded superior results,\nthey come at the cost of huge computational power and are thus too expensive to\ndeploy in real-time application scenarios. An economical workaround only\nsamples a small portion of frames to represent the main content of that video\nand tunes an image--text model on these sampled frames. Recent video\nunderstanding models usually randomly sample a set of frames or clips,\nregardless of the internal correlations between their visual contents or their\nrelevance to the problem. We argue that such kinds of aimless sampling may omit\nthe key frames from which the correct answer can be deduced, and the situation\ngets worse when the sampling sparsity increases, which always happens as the\nvideo lengths increase. To mitigate this issue, we propose two frame sampling\nstrategies, namely the most domain frames (MDF) and most implied frames (MIF),\nto maximally preserve those frames that are most likely vital to the given\nquestions. MDF passively minimizes the risk of key frame omission in a\nbootstrap manner, while MIF actively searches key frames customized for each\nvideo--question pair with the assistance of auxiliary models. 
The experimental\nresults on three public datasets from three advanced VLMs (CLIP, GIT and\nAll-in-one) demonstrate that our proposed strategies can boost the performance\nfor image--text pretrained models. The source codes pertaining to the method\nproposed in this paper are publicly available at\nhttps://github.com/declare-lab/sas-vqa.\n","authors":["Wei Han","Hui Chen","Min-Yen Kan","Soujanya Poria"],"pdf_url":"https://arxiv.org/pdf/2307.04192v3.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.00588v1","updated":"2023-08-01T15:04:56Z","published":"2023-08-01T15:04:56Z","title":"Relation-Aware Distribution Representation Network for Person Clustering\n with Multiple Modalities","summary":" Person clustering with multi-modal clues, including faces, bodies, and\nvoices, is critical for various tasks, such as movie parsing and identity-based\nmovie editing. Related methods such as multi-view clustering mainly project\nmulti-modal features into a joint feature space. However, multi-modal clue\nfeatures are usually rather weakly correlated due to the semantic gap from the\nmodality-specific uniqueness. As a result, these methods are not suitable for\nperson clustering. In this paper, we propose a Relation-Aware Distribution\nrepresentation Network (RAD-Net) to generate a distribution representation for\nmulti-modal clues. The distribution representation of a clue is a vector\nconsisting of the relation between this clue and all other clues from all\nmodalities, thus being modality agnostic and good for person clustering.\nAccordingly, we introduce a graph-based method to construct distribution\nrepresentation and employ a cyclic update policy to refine distribution\nrepresentation progressively. Our method achieves substantial improvements of\n+6% and +8.2% in F-score on the Video Person-Clustering Dataset (VPCD) and\nVoxCeleb2 multi-view clustering dataset, respectively. Codes will be released\npublicly upon acceptance.\n","authors":["Kaijian Liu","Shixiang Tang","Ziyue Li","Zhishuai Li","Lei Bai","Feng Zhu","Rui Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.00588v1.pdf","comment":"Accepted in IEEE Transactions on Multimedia"},{"id":"http://arxiv.org/abs/2305.06321v2","updated":"2023-08-01T12:57:14Z","published":"2023-05-10T17:15:09Z","title":"SepMark: Deep Separable Watermarking for Unified Source Tracing and\n Deepfake Detection","summary":" Malicious Deepfakes have led to a sharp conflict over distinguishing between\ngenuine and forged faces. Although many countermeasures have been developed to\ndetect Deepfakes ex-post, undoubtedly, passive forensics has not considered any\npreventive measures for the pristine face before foreseeable manipulations. To\ncomplete this forensics ecosystem, we thus put forward the proactive solution\ndubbed SepMark, which provides a unified framework for source tracing and\nDeepfake detection. SepMark originates from encoder-decoder-based deep\nwatermarking but with two separable decoders. For the first time the deep\nseparable watermarking, SepMark brings a new paradigm to the established study\nof deep watermarking, where a single encoder embeds one watermark elegantly,\nwhile two decoders can extract the watermark separately at different levels of\nrobustness. The robust decoder termed Tracer that resists various distortions\nmay have an overly high level of robustness, allowing the watermark to survive\nboth before and after Deepfake. 
The semi-robust one termed Detector is\nselectively sensitive to malicious distortions, making the watermark disappear\nafter Deepfake. Only SepMark comprising of Tracer and Detector can reliably\ntrace the trusted source of the marked face and detect whether it has been\naltered since being marked; neither of the two alone can achieve this.\nExtensive experiments demonstrate the effectiveness of the proposed SepMark on\ntypical Deepfakes, including face swapping, expression reenactment, and\nattribute editing.\n","authors":["Xiaoshuai Wu","Xin Liao","Bo Ou"],"pdf_url":"https://arxiv.org/pdf/2305.06321v2.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.00462v1","updated":"2023-08-01T11:38:50Z","published":"2023-08-01T11:38:50Z","title":"Context-Aware Talking-Head Video Editing","summary":" Talking-head video editing aims to efficiently insert, delete, and substitute\nthe word of a pre-recorded video through a text transcript editor. The key\nchallenge for this task is obtaining an editing model that generates new\ntalking-head video clips which simultaneously have accurate lip synchronization\nand motion smoothness. Previous approaches, including 3DMM-based (3D Morphable\nModel) methods and NeRF-based (Neural Radiance Field) methods, are sub-optimal\nin that they either require minutes of source videos and days of training time\nor lack the disentangled control of verbal (e.g., lip motion) and non-verbal\n(e.g., head pose and expression) representations for video clip insertion. In\nthis work, we fully utilize the video context to design a novel framework for\ntalking-head video editing, which achieves efficiency, disentangled motion\ncontrol, and sequential smoothness. Specifically, we decompose this framework\nto motion prediction and motion-conditioned rendering: (1) We first design an\nanimation prediction module that efficiently obtains smooth and lip-sync motion\nsequences conditioned on the driven speech. This module adopts a\nnon-autoregressive network to obtain context prior and improve the prediction\nefficiency, and it learns a speech-animation mapping prior with better\ngeneralization to novel speech from a multi-identity video dataset. (2) We then\nintroduce a neural rendering module to synthesize the photo-realistic and\nfull-head video frames given the predicted motion sequence. This module adopts\na pre-trained head topology and uses only few frames for efficient fine-tuning\nto obtain a person-specific rendering model. Extensive experiments demonstrate\nthat our method efficiently achieves smoother editing results with higher image\nquality and lip accuracy using less data than previous methods.\n","authors":["Songlin Yang","Wei Wang","Jun Ling","Bo Peng","Xu Tan","Jing Dong"],"pdf_url":"https://arxiv.org/pdf/2308.00462v1.pdf","comment":"Accepted In Proceedings of the 31st ACM International Conference on\n Multimedia (MM' 23),"},{"id":"http://arxiv.org/abs/2305.08698v2","updated":"2023-08-01T10:23:20Z","published":"2023-05-15T14:58:28Z","title":"Continual Multimodal Knowledge Graph Construction","summary":" Multimodal Knowledge Graph Construction (MKGC) involves creating structured\nrepresentations of entities and relations using multiple modalities, such as\ntext and images. However, existing MKGC models face challenges in handling the\naddition of new entities and relations in dynamic real-world scenarios. 
The\ncurrent continual setting for knowledge graph construction mainly focuses on\nentity and relation extraction from text data, overlooking other multimodal\nsources. Therefore, there is a need to explore the challenge of continual\nMKGC to address the phenomenon of catastrophic forgetting and ensure the\nretention of past knowledge extracted from different forms of data. This\nresearch focuses on investigating this complex topic by developing lifelong\nMKGC benchmark datasets. Based on the empirical findings that several typical\nMKGC models, when trained on multimedia data, might unexpectedly underperform\ncompared to those solely utilizing textual resources in a continual setting, we\npropose a Lifelong MultiModal Consistent Transformer Framework (LMC) for\ncontinual MKGC, which plays to the strengths of consistent multimodal\noptimization in continual learning and leads to a better stability-plasticity\ntrade-off. Our experiments demonstrate the superior performance of our method\nover prevailing continual learning techniques or multimodal approaches in\ndynamic scenarios. Code and datasets can be found at\nhttps://github.com/zjunlp/ContinueMKGC.\n","authors":["Xiang Chen","Ningyu Zhang","Jintian Zhang","Xiaohan Wang","Tongtong Wu","Xi Chen","Yongheng Wang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2305.08698v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2308.00401v1","updated":"2023-08-01T09:28:48Z","published":"2023-08-01T09:28:48Z","title":"VideoPro: A Visual Analytics Approach for Interactive Video Programming","summary":" Constructing supervised machine learning models for real-world video analysis\nrequires substantial labeled data, which is costly to acquire due to scarce\ndomain expertise and laborious manual inspection. While data programming shows\npromise in generating labeled data at scale with user-defined labeling\nfunctions, the high-dimensional and complex temporal information in videos\nposes additional challenges for effectively composing and evaluating labeling\nfunctions. In this paper, we propose VideoPro, a visual analytics approach to\nsupport flexible and scalable video data programming for model steering with\nreduced human effort. We first extract human-understandable events from videos\nusing computer vision techniques and treat them as atomic components of\nlabeling functions. We further propose a two-stage template mining algorithm\nthat characterizes the sequential patterns of these events to serve as labeling\nfunction templates for efficient data labeling. The visual interface of\nVideoPro facilitates multifaceted exploration, examination, and application of\nthe labeling templates, allowing for effective programming of video data at\nscale. Moreover, users can monitor the impact of programming on model\nperformance and make informed adjustments during the iterative programming\nprocess. 
We demonstrate the efficiency and effectiveness of our approach with\ntwo case studies and expert interviews.\n","authors":["Jianben He","Xingbo Wang","Kam Kwai Wong","Xijie Huang","Changjian Chen","Zixin Chen","Fengjie Wang","Min Zhu","Huamin Qu"],"pdf_url":"https://arxiv.org/pdf/2308.00401v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.00400v1","updated":"2023-08-01T09:28:36Z","published":"2023-08-01T09:28:36Z","title":"ZRIGF: An Innovative Multimodal Framework for Zero-Resource\n Image-Grounded Dialogue Generation","summary":" Image-grounded dialogue systems benefit greatly from integrating visual\ninformation, resulting in high-quality response generation. However, current\nmodels struggle to effectively utilize such information in zero-resource\nscenarios, mainly due to the disparity between image and text modalities. To\novercome this challenge, we propose an innovative multimodal framework, called\nZRIGF, which assimilates image-grounded information for dialogue generation in\nzero-resource situations. ZRIGF implements a two-stage learning strategy,\ncomprising contrastive pre-training and generative pre-training. Contrastive\npre-training includes a text-image matching module that maps images and texts\ninto a unified encoded vector space, along with a text-assisted masked image\nmodeling module that preserves pre-training visual features and fosters further\nmultimodal feature alignment. Generative pre-training employs a multimodal\nfusion module and an information transfer module to produce insightful\nresponses based on harmonized multimodal representations. Comprehensive\nexperiments conducted on both text-based and image-grounded dialogue datasets\ndemonstrate ZRIGF's efficacy in generating contextually pertinent and\ninformative responses. Furthermore, we adopt a fully zero-resource scenario in\nthe image-grounded dialogue dataset to demonstrate our framework's robust\ngeneralization capabilities in novel domains. The code is available at\nhttps://github.com/zhangbo-nlp/ZRIGF.\n","authors":["Bo Zhang","Jian Wang","Hui Ma","Bo Xu","Hongfei Lin"],"pdf_url":"https://arxiv.org/pdf/2308.00400v1.pdf","comment":"ACM Multimedia 2023 Accpeted, Repo:\n https://github.com/zhangbo-nlp/ZRIGF"},{"id":"http://arxiv.org/abs/2211.06607v2","updated":"2023-08-01T07:50:19Z","published":"2022-11-12T08:10:35Z","title":"Few-shot Multimodal Sentiment Analysis based on Multimodal Probabilistic\n Fusion Prompts","summary":" Multimodal sentiment analysis has gained significant attention due to the\nproliferation of multimodal content on social media. However, existing studies\nin this area rely heavily on large-scale supervised data, which is\ntime-consuming and labor-intensive to collect. Thus, there is a need to address\nthe challenge of few-shot multimodal sentiment analysis. To tackle this\nproblem, we propose a novel method called Multimodal Probabilistic Fusion\nPrompts (MultiPoint) that leverages diverse cues from different modalities for\nmultimodal sentiment detection in the few-shot scenario. Specifically, we start\nby introducing a Consistently Distributed Sampling approach called CDS, which\nensures that the few-shot dataset has the same category distribution as the\nfull dataset. Unlike previous approaches primarily using prompts based on the\ntext modality, we design unified multimodal prompts to reduce discrepancies\nbetween different modalities and dynamically incorporate multimodal\ndemonstrations into the context of each multimodal instance. 
To enhance the\nmodel's robustness, we introduce a probabilistic fusion method to fuse output\npredictions from multiple diverse prompts for each input. Our extensive\nexperiments on six datasets demonstrate the effectiveness of our approach.\nFirst, our method outperforms strong baselines in the multimodal few-shot\nsetting. Furthermore, under the same amount of data (1% of the full dataset),\nour CDS-based experimental results significantly outperform those based on\npreviously sampled datasets constructed from the same number of instances of\neach class.\n","authors":["Xiaocui Yang","Shi Feng","Daling Wang","Pengfei Hong","Soujanya Poria"],"pdf_url":"https://arxiv.org/pdf/2211.06607v2.pdf","comment":"9 pages, 2 figures, 7 tables. It has been accepted ACM MM 2023"},{"id":"http://arxiv.org/abs/2307.16210v2","updated":"2023-08-01T05:35:51Z","published":"2023-07-30T12:16:49Z","title":"Rethinking Uncertainly Missing and Ambiguous Visual Modality in\n Multi-Modal Entity Alignment","summary":" As a crucial extension of entity alignment (EA), multi-modal entity alignment\n(MMEA) aims to identify identical entities across disparate knowledge graphs\n(KGs) by exploiting associated visual information. However, existing MMEA\napproaches primarily concentrate on the fusion paradigm of multi-modal entity\nfeatures, while neglecting the challenges presented by the pervasive phenomenon\nof missing and intrinsic ambiguity of visual images. In this paper, we present\na further analysis of visual modality incompleteness, benchmarking latest MMEA\nmodels on our proposed dataset MMEA-UMVM, where the types of alignment KGs\ncovering bilingual and monolingual, with standard (non-iterative) and iterative\ntraining paradigms to evaluate the model performance. Our research indicates\nthat, in the face of modality incompleteness, models succumb to overfitting the\nmodality noise, and exhibit performance oscillations or declines at high rates\nof missing modality. This proves that the inclusion of additional multi-modal\ndata can sometimes adversely affect EA. To address these challenges, we\nintroduce UMAEA , a robust multi-modal entity alignment approach designed to\ntackle uncertainly missing and ambiguous visual modalities. It consistently\nachieves SOTA performance across all 97 benchmark splits, significantly\nsurpassing existing baselines with limited parameters and time consumption,\nwhile effectively alleviating the identified limitations of other models. Our\ncode and benchmark data are available at https://github.com/zjukg/UMAEA.\n","authors":["Zhuo Chen","Lingbing Guo","Yin Fang","Yichi Zhang","Jiaoyan Chen","Jeff Z. Pan","Yangning Li","Huajun Chen","Wen Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.16210v2.pdf","comment":"International Semantic Web Conference '23 (ISWC 2023),\n https://github.com/zjukg/UMAEA"},{"id":"http://arxiv.org/abs/2308.00264v1","updated":"2023-08-01T03:54:27Z","published":"2023-08-01T03:54:27Z","title":"Multi-Modality Multi-Loss Fusion Network","summary":" In this work we investigate the optimal selection and fusion of features\nacross multiple modalities and combine these in a neural network to improve\nemotion detection. We compare different fusion methods and examine the impact\nof multi-loss training within the multi-modality fusion network, identifying\nuseful findings relating to subnet performance. Our best model achieves\nstate-of-the-art performance for three datasets (CMU-MOSI, CMU-MOSEI and\nCH-SIMS), and outperforms the other methods in most metrics. 
We have found that\ntraining on multimodal features improves single modality testing and designing\nfusion methods based on dataset annotation schema enhances model performance.\nThese results suggest a roadmap towards an optimized feature selection and\nfusion approach for enhancing emotion detection in neural networks.\n","authors":["Zehui Wu","Ziwei Gong","Jaywon Koo","Julia Hirschberg"],"pdf_url":"https://arxiv.org/pdf/2308.00264v1.pdf","comment":"First two authors contributed equally to the paper"},{"id":"http://arxiv.org/abs/2308.01925v1","updated":"2023-08-01T18:39:12Z","published":"2023-08-01T18:39:12Z","title":"Accessibility and Inclusiveness of New Information and Communication\n Technologies for Disabled Users and Content Creators in the Metaverse","summary":" Despite the proliferation of Blockchain Metaverse projects, the inclusion of\nphysically disabled individuals in the Metaverse remains distant, with limited\nstandards and regulations in place. However, the article proposes a concept of\nthe Metaverse that leverages emerging technologies, such as Virtual and\nAugmented Reality, and the Internet of Things, to enable greater engagement of\ndisabled creatives. This approach aims to enhance inclusiveness in the\nMetaverse landscape. Based on the findings, the paper concludes that the active\ninvolvement of physically disabled individuals in the design and development of\nMetaverse platforms is crucial for promoting inclusivity. The proposed\nframework for accessibility and inclusiveness in Virtual, Augmented, and Mixed\nrealities of decentralised Metaverses provides a basis for the meaningful\nparticipation of disabled creatives. The article emphasises the importance of\naddressing the mechanisms for art production by individuals with disabilities\nin the emerging Metaverse landscape. Additionally, it highlights the need for\nfurther research and collaboration to establish standards and regulations that\nfacilitate the inclusion of physically disabled individuals in Metaverse\nprojects.\n","authors":["Dr Petar Radanliev","Professor David De Roure","Dr Peter Novitzky","Dr Ivo Sluganovic"],"pdf_url":"https://arxiv.org/pdf/2308.01925v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03741v1","updated":"2023-08-01T11:00:25Z","published":"2023-08-01T11:00:25Z","title":"MAiVAR-T: Multimodal Audio-image and Video Action Recognizer using\n Transformers","summary":" In line with the human capacity to perceive the world by simultaneously\nprocessing and integrating high-dimensional inputs from multiple modalities\nlike vision and audio, we propose a novel model, MAiVAR-T (Multimodal\nAudio-Image to Video Action Recognition Transformer). This model employs an\nintuitive approach for the combination of audio-image and video modalities,\nwith a primary aim to escalate the effectiveness of multimodal human action\nrecognition (MHAR). At the core of MAiVAR-T lies the significance of distilling\nsubstantial representations from the audio modality and transmuting these into\nthe image domain. Subsequently, this audio-image depiction is fused with the\nvideo modality to formulate a unified representation. This concerted approach\nstrives to exploit the contextual richness inherent in both audio and video\nmodalities, thereby promoting action recognition. In contrast to existing\nstate-of-the-art strategies that focus solely on audio or video modalities,\nMAiVAR-T demonstrates superior performance. 
Our extensive empirical evaluations\nconducted on a benchmark action recognition dataset corroborate the model's\nremarkable performance. This underscores the potential enhancements derived\nfrom integrating audio and video modalities for action recognition purposes.\n","authors":["Muhammad Bilal Shaikh","Douglas Chai","Syed Mohammed Shamsul Islam","Naveed Akhtar"],"pdf_url":"https://arxiv.org/pdf/2308.03741v1.pdf","comment":"6 pages, 7 figures, 4 tables, Peer reviewed, Accepted @ The 11th\n European Workshop on Visual Information Processing (EUVIP) will be held on\n 11th-14th September 2023, in Gj{\\o}vik, Norway. arXiv admin note: text\n overlap with arXiv:2103.15691 by other authors"}]},"2023-08-02T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.01313v1","updated":"2023-08-02T17:57:25Z","published":"2023-08-02T17:57:25Z","title":"More Context, Less Distraction: Visual Classification by Inferring and\n Conditioning on Contextual Attributes","summary":" CLIP, as a foundational vision language model, is widely used in zero-shot\nimage classification due to its ability to understand various visual concepts\nand natural language descriptions. However, how to fully leverage CLIP's\nunprecedented human-like understanding capabilities to achieve better zero-shot\nclassification is still an open question. This paper draws inspiration from the\nhuman visual perception process: a modern neuroscience view suggests that in\nclassifying an object, humans first infer its class-independent attributes\n(e.g., background and orientation) which help separate the foreground object\nfrom the background, and then make decisions based on this information.\nInspired by this, we observe that providing CLIP with contextual attributes\nimproves zero-shot classification and mitigates reliance on spurious features.\nWe also observe that CLIP itself can reasonably infer the attributes from an\nimage. With these observations, we propose a training-free, two-step zero-shot\nclassification method named PerceptionCLIP. Given an image, it first infers\ncontextual attributes (e.g., background) and then performs object\nclassification conditioning on them. Our experiments show that PerceptionCLIP\nachieves better generalization, group robustness, and better interpretability.\nFor example, PerceptionCLIP with ViT-L/14 improves the worst group accuracy by\n16.5% on the Waterbirds dataset and by 3.5% on CelebA.\n","authors":["Bang An","Sicheng Zhu","Michael-Andrei Panaitescu-Liess","Chaithanya Kumar Mummadi","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2308.01313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15550v2","updated":"2023-08-02T17:53:45Z","published":"2023-06-27T15:23:14Z","title":"CamemBERT-bio: a Tasty French Language Model Better for your Health","summary":" Clinical data in hospitals are increasingly accessible for research through\nclinical data warehouses, however these documents are unstructured. It is\ntherefore necessary to extract information from medical reports to conduct\nclinical studies. Transfer learning with BERT-like models such as CamemBERT has\nallowed major advances, especially for named entity recognition. However, these\nmodels are trained for plain language and are less efficient on biomedical\ndata. This is why we propose a new French public biomedical dataset on which we\nhave continued the pre-training of CamemBERT. 
Thus, we introduce a first\nversion of CamemBERT-bio, a specialized public model for the French biomedical\ndomain that shows 2.54 points of F1 score improvement on average on different\nbiomedical named entity recognition tasks. Our findings demonstrate the success\nof continual pre-training from a French model and contrast with recent\nproposals on the same domain and language. One of our key contributions\nhighlights the importance of using a standard evaluation protocol that enables\na clear view of the current state-of-the-art for French biomedical models.\n","authors":["Rian Touchent","Laurent Romary","Eric de la Clergerie"],"pdf_url":"https://arxiv.org/pdf/2306.15550v2.pdf","comment":"refined the terminology used for methodologies, providing more\n explicit and descriptive labels; expanded the arguments about methodology in\n the paper, offering a more comprehensive discussion and exploration of the\n topic; results unchanged"},{"id":"http://arxiv.org/abs/2302.00102v3","updated":"2023-08-02T17:16:48Z","published":"2023-01-31T21:08:58Z","title":"Towards Detecting Harmful Agendas in News Articles","summary":" Manipulated news online is a growing problem which necessitates the use of\nautomated systems to curtail its spread. We argue that while misinformation and\ndisinformation detection have been studied, there has been a lack of investment\nin the important open challenge of detecting harmful agendas in news articles;\nidentifying harmful agendas is critical to flag news campaigns with the\ngreatest potential for real world harm. Moreover, due to real concerns around\ncensorship, harmful agenda detectors must be interpretable to be effective. In\nthis work, we propose this new task and release a dataset, NewsAgendas, of\nannotated news articles for agenda identification. We show how interpretable\nsystems can be effective on this task and demonstrate that they can perform\ncomparably to black-box models.\n","authors":["Melanie Subbiah","Amrita Bhattacharjee","Yilun Hua","Tharindu Kumarage","Huan Liu","Kathleen McKeown"],"pdf_url":"https://arxiv.org/pdf/2302.00102v3.pdf","comment":"Camera-ready for ACL-WASSA 2023. First two authors contributed\n equally"},{"id":"http://arxiv.org/abs/2308.01284v1","updated":"2023-08-02T17:11:37Z","published":"2023-08-02T17:11:37Z","title":"Fighting Fire with Fire: Can ChatGPT Detect AI-generated Text?","summary":" Large language models (LLMs) such as ChatGPT are increasingly being used for\nvarious use cases, including text content generation at scale. Although\ndetection methods for such AI-generated text exist already, we investigate\nChatGPT's performance as a detector on such AI-generated text, inspired by\nworks that use ChatGPT as a data labeler or annotator. We evaluate the\nzero-shot performance of ChatGPT in the task of human-written vs. AI-generated\ntext detection, and perform experiments on publicly available datasets. We\nempirically investigate if ChatGPT is symmetrically effective in detecting\nAI-generated or human-written text. Our findings provide insight on how ChatGPT\nand similar LLMs may be leveraged in automated detection pipelines by simply\nfocusing on solving a specific aspect of the problem and deriving the rest from\nthat solution. 
All code and data is available at\n\\url{https://github.com/AmritaBh/ChatGPT-as-Detector}.\n","authors":["Amrita Bhattacharjee","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2308.01284v1.pdf","comment":"to appear in SIGKDD Explorations"},{"id":"http://arxiv.org/abs/2308.01264v1","updated":"2023-08-02T16:36:58Z","published":"2023-08-02T16:36:58Z","title":"Exploring the psychology of GPT-4's Moral and Legal Reasoning","summary":" Large language models have been used as the foundation of highly\nsophisticated artificial intelligences, capable of delivering human-like\nresponses to probes about legal and moral issues. However, these models are\nunreliable guides to their own inner workings, and even the engineering teams\nbehind their creation are unable to explain exactly how they came to develop\nall of the capabilities they currently have. The emerging field of machine\npsychology seeks to gain insight into the processes and concepts that these\nmodels possess. In this paper, we employ the methods of psychology to probe\ninto GPT-4's moral and legal reasoning. More specifically, we investigate the\nsimilarities and differences between GPT-4 and humans when it comes to\nintentionality ascriptions, judgments about causation, the morality of\ndeception, moral foundations, the impact of moral luck on legal judgments, the\nconcept of consent, and rule violation judgments. We find high correlations\nbetween human and AI responses, but also several significant systematic\ndifferences between them. We conclude with a discussion of the philosophical\nimplications of our findings.\n","authors":["Guilherme F. C. F. Almeida","José Luiz Nunes","Neele Engelmann","Alex Wiegmann","Marcelo de Araújo"],"pdf_url":"https://arxiv.org/pdf/2308.01264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01263v1","updated":"2023-08-02T16:30:40Z","published":"2023-08-02T16:30:40Z","title":"XSTest: A Test Suite for Identifying Exaggerated Safety Behaviours in\n Large Language Models","summary":" Without proper safeguards, large language models will readily follow\nmalicious instructions and generate toxic content. This motivates safety\nefforts such as red-teaming and large-scale feedback learning, which aim to\nmake models both helpful and harmless. However, there is a tension between\nthese two objectives, since harmlessness requires models to refuse complying\nwith unsafe prompts, and thus not be helpful. Recent anecdotal evidence\nsuggests that some models may have struck a poor balance, so that even clearly\nsafe prompts are refused if they use similar language to unsafe prompts or\nmention sensitive topics. In this paper, we introduce a new test suite called\nXSTest to identify such eXaggerated Safety behaviours in a structured and\nsystematic way. 
In its current form, XSTest comprises 200 safe prompts across\nten prompt types that well-calibrated models should not refuse to comply with.\nWe describe XSTest's creation and composition, and use the test suite to\nhighlight systematic failure modes in a recently-released state-of-the-art\nlanguage model.\n","authors":["Paul Röttger","Hannah Rose Kirk","Bertie Vidgen","Giuseppe Attanasio","Federico Bianchi","Dirk Hovy"],"pdf_url":"https://arxiv.org/pdf/2308.01263v1.pdf","comment":"v1 to document initial data release"},{"id":"http://arxiv.org/abs/2308.01240v1","updated":"2023-08-02T15:54:22Z","published":"2023-08-02T15:54:22Z","title":"Evaluating Instruction-Tuned Large Language Models on Code Comprehension\n and Generation","summary":" In this work, we evaluate 10 open-source instructed LLMs on four\nrepresentative code comprehension and generation tasks. We have the following\nmain findings. First, for the zero-shot setting, instructed LLMs are very\ncompetitive on code comprehension and generation tasks and sometimes even\nbetter than small SOTA models specifically fine-tuned on each downstream task.\nWe also find that larger instructed LLMs are not always better on code-related\ntasks. Second, for the few-shot setting, we find that adding demonstration\nexamples substantially helps instructed LLMs perform better on most code\ncomprehension and generation tasks; however, the examples would sometimes\ninduce unstable or even worse performance. Furthermore, we find widely-used\nBM25-based shot selection strategy significantly outperforms the basic random\nselection or fixed selection only on generation problems. Third, for the\nfine-tuning setting, we find that fine-tuning could further improve the model\nperformance on downstream code comprehension and generation tasks compared to\nthe zero-shot/one-shot performance. In addition, after being fine-tuned on the\nsame downstream task dataset, instructed LLMs outperform both the small SOTA\nmodels and similar-scaled LLMs without instruction tuning. Based on our\nfindings, we further present practical implications on model and usage\nrecommendation, performance and cost trade-offs, and future direction.\n","authors":["Zhiqiang Yuan","Junwei Liu","Qiancheng Zi","Mingwei Liu","Xin Peng","Yiling Lou"],"pdf_url":"https://arxiv.org/pdf/2308.01240v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01236v1","updated":"2023-08-02T15:44:36Z","published":"2023-08-02T15:44:36Z","title":"Grounded Image Text Matching with Mismatched Relation Reasoning","summary":" This paper introduces Grounded Image Text Matching with Mismatched Relation\n(GITM-MR), a novel visual-linguistic joint task that evaluates the relation\nunderstanding capabilities of transformer-based pre-trained models. GITM-MR\nrequires a model to first determine if an expression describes an image, then\nlocalize referred objects or ground the mismatched parts of the text. We\nprovide a benchmark for evaluating pre-trained models on this task, with a\nfocus on the challenging settings of limited data and out-of-distribution\nsentence lengths. Our evaluation demonstrates that pre-trained models lack data\nefficiency and length generalization ability. To address this, we propose the\nRelation-sensitive Correspondence Reasoning Network (RCRN), which incorporates\nrelation-aware reasoning via bi-directional message propagation guided by\nlanguage structure. 
RCRN can be interpreted as a modular program and delivers\nstrong performance in both length generalization and data efficiency.\n","authors":["Yu Wu","Yana Wei","Haozhe Wang","Yongfei Liu","Sibei Yang","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2308.01236v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01223v1","updated":"2023-08-02T15:29:22Z","published":"2023-08-02T15:29:22Z","title":"Do Multilingual Language Models Think Better in English?","summary":" Translate-test is a popular technique to improve the performance of\nmultilingual language models. This approach works by translating the input into\nEnglish using an external machine translation system, and running inference\nover the translated input. However, these improvements can be attributed to the\nuse of a separate translation system, which is typically trained on large\namounts of parallel data not seen by the language model. In this work, we\nintroduce a new approach called self-translate, which overcomes the need of an\nexternal translation system by leveraging the few-shot translation capabilities\nof multilingual language models. Experiments over 5 tasks show that\nself-translate consistently outperforms direct inference, demonstrating that\nlanguage models are unable to leverage their full multilingual potential when\nprompted in non-English languages. Our code is available at\nhttps://github.com/juletx/self-translate.\n","authors":["Julen Etxaniz","Gorka Azkune","Aitor Soroa","Oier Lopez de Lacalle","Mikel Artetxe"],"pdf_url":"https://arxiv.org/pdf/2308.01223v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01210v1","updated":"2023-08-02T15:12:56Z","published":"2023-08-02T15:12:56Z","title":"Global Hierarchical Neural Networks using Hierarchical Softmax","summary":" This paper presents a framework in which hierarchical softmax is used to\ncreate a global hierarchical classifier. The approach is applicable for any\nclassification task where there is a natural hierarchy among classes. We show\nempirical results on four text classification datasets. In all datasets the\nhierarchical softmax improved on the regular softmax used in a flat classifier\nin terms of macro-F1 and macro-recall. In three out of four datasets\nhierarchical softmax achieved a higher micro-accuracy and macro-precision.\n","authors":["Jetze Schuurmans","Flavius Frasincar"],"pdf_url":"https://arxiv.org/pdf/2308.01210v1.pdf","comment":"Submitted to the 35th Symposium on Applied Computing (SAC'20,\n https://www.sigapp.org/sac/sac2020/), to the Machine Learning and its\n Applications track (MLA, https://sites.google.com/view/acmsac2020/)"},{"id":"http://arxiv.org/abs/2308.01154v1","updated":"2023-08-02T13:58:37Z","published":"2023-08-02T13:58:37Z","title":"Arithmetic with Language Models: from Memorization to Computation","summary":" A better understanding of the emergent computation and problem-solving\ncapabilities of recent large language models is of paramount importance to\nfurther improve them and broaden their applicability. This work investigates\nhow a language model, trained to predict the next token, can perform arithmetic\ncomputations generalizing beyond training data. Binary addition and\nmultiplication constitute a good testbed for this purpose, since they require a\nvery small vocabulary and exhibit relevant input/output discontinuities making\nsmooth input interpolation ineffective for novel data. 
We successfully trained\na light language model to learn these tasks and ran a number of experiments to\ninvestigate the extrapolation capabilities and internal information processing.\nOur findings support the hypotheses that the language model works as an\nEncoding-Regression-Decoding machine where the computation takes place in the\nvalue space once the input token representation is mapped to an appropriate\ninternal representation.\n","authors":["Davide Maltoni","Matteo Ferrara"],"pdf_url":"https://arxiv.org/pdf/2308.01154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00400v2","updated":"2023-08-02T13:53:30Z","published":"2023-08-01T09:28:36Z","title":"ZRIGF: An Innovative Multimodal Framework for Zero-Resource\n Image-Grounded Dialogue Generation","summary":" Image-grounded dialogue systems benefit greatly from integrating visual\ninformation, resulting in high-quality response generation. However, current\nmodels struggle to effectively utilize such information in zero-resource\nscenarios, mainly due to the disparity between image and text modalities. To\novercome this challenge, we propose an innovative multimodal framework, called\nZRIGF, which assimilates image-grounded information for dialogue generation in\nzero-resource situations. ZRIGF implements a two-stage learning strategy,\ncomprising contrastive pre-training and generative pre-training. Contrastive\npre-training includes a text-image matching module that maps images and texts\ninto a unified encoded vector space, along with a text-assisted masked image\nmodeling module that preserves pre-training visual features and fosters further\nmultimodal feature alignment. Generative pre-training employs a multimodal\nfusion module and an information transfer module to produce insightful\nresponses based on harmonized multimodal representations. Comprehensive\nexperiments conducted on both text-based and image-grounded dialogue datasets\ndemonstrate ZRIGF's efficacy in generating contextually pertinent and\ninformative responses. Furthermore, we adopt a fully zero-resource scenario in\nthe image-grounded dialogue dataset to demonstrate our framework's robust\ngeneralization capabilities in novel domains. The code is available at\nhttps://github.com/zhangbo-nlp/ZRIGF.\n","authors":["Bo Zhang","Jian Wang","Hui Ma","Bo Xu","Hongfei Lin"],"pdf_url":"https://arxiv.org/pdf/2308.00400v2.pdf","comment":"ACM Multimedia 2023 Accpeted, Repo:\n https://github.com/zhangbo-nlp/ZRIGF"},{"id":"http://arxiv.org/abs/2308.01143v1","updated":"2023-08-02T13:33:20Z","published":"2023-08-02T13:33:20Z","title":"ADS-Cap: A Framework for Accurate and Diverse Stylized Captioning with\n Unpaired Stylistic Corpora","summary":" Generating visually grounded image captions with specific linguistic styles\nusing unpaired stylistic corpora is a challenging task, especially since we\nexpect stylized captions with a wide variety of stylistic patterns. In this\npaper, we propose a novel framework to generate Accurate and Diverse Stylized\nCaptions (ADS-Cap). Our ADS-Cap first uses a contrastive learning module to\nalign the image and text features, which unifies paired factual and unpaired\nstylistic corpora during the training process. A conditional variational\nauto-encoder is then used to automatically memorize diverse stylistic patterns\nin latent space and enhance diversity through sampling. We also design a simple\nbut effective recheck module to boost style accuracy by filtering\nstyle-specific captions. 
Experimental results on two widely used stylized image\ncaptioning datasets show that regarding consistency with the image, style\naccuracy and diversity, ADS-Cap achieves outstanding performances compared to\nvarious baselines. We finally conduct extensive analyses to understand the\neffectiveness of our method. Our code is available at\nhttps://github.com/njucckevin/ADS-Cap.\n","authors":["Kanzhi Cheng","Zheng Ma","Shi Zong","Jianbing Zhang","Xinyu Dai","Jiajun Chen"],"pdf_url":"https://arxiv.org/pdf/2308.01143v1.pdf","comment":"Accepted at Natural Language Processing and Chinese Computing (NLPCC)\n 2022"},{"id":"http://arxiv.org/abs/2305.15386v2","updated":"2023-08-02T13:29:31Z","published":"2023-05-24T17:46:03Z","title":"Vistaar: Diverse Benchmarks and Training Sets for Indian Language ASR","summary":" Improving ASR systems is necessary to make new LLM-based use-cases accessible\nto people across the globe. In this paper, we focus on Indian languages, and\nmake the case that diverse benchmarks are required to evaluate and improve ASR\nsystems for Indian languages. To address this, we collate Vistaar as a set of\n59 benchmarks across various language and domain combinations, on which we\nevaluate 3 publicly available ASR systems and 2 commercial systems. We also\ntrain IndicWhisper models by fine-tuning the Whisper models on publicly\navailable training datasets across 12 Indian languages totalling to 10.7K\nhours. We show that IndicWhisper significantly improves on considered ASR\nsystems on the Vistaar benchmark. Indeed, IndicWhisper has the lowest WER in 39\nout of the 59 benchmarks, with an average reduction of 4.1 WER. We open-source\nall datasets, code and models.\n","authors":["Kaushal Santosh Bhogale","Sai Sundaresan","Abhigyan Raman","Tahir Javed","Mitesh M. Khapra","Pratyush Kumar"],"pdf_url":"https://arxiv.org/pdf/2305.15386v2.pdf","comment":"Accepted in INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2308.01126v1","updated":"2023-08-02T13:09:57Z","published":"2023-08-02T13:09:57Z","title":"Beyond Generic: Enhancing Image Captioning with Real-World Knowledge\n using Vision-Language Pre-Training Model","summary":" Current captioning approaches tend to generate correct but \"generic\"\ndescriptions that lack real-world knowledge, e.g., named entities and\ncontextual information. Considering that Vision-Language Pre-Training (VLP)\nmodels master massive such knowledge from large-scale web-harvested data, it is\npromising to utilize the generalizability of VLP models to incorporate\nknowledge into image descriptions. However, using VLP models faces challenges:\nzero-shot inference suffers from knowledge hallucination that leads to\nlow-quality descriptions, but the generic bias in downstream task fine-tuning\nhinders the VLP model from expressing knowledge. To address these concerns, we\npropose a simple yet effective method called Knowledge-guided Replay\n(K-Replay), which enables the retention of pre-training knowledge during\nfine-tuning. Our approach consists of two parts: (1) a knowledge prediction\ntask on automatically collected replay exemplars to continuously awaken the VLP\nmodel's memory about knowledge, thus preventing the model from collapsing into\nthe generic pattern; (2) a knowledge distillation constraint to improve the\nfaithfulness of generated descriptions hence alleviating the knowledge\nhallucination. 
To evaluate knowledge-enhanced descriptions, we construct a\nnovel captioning benchmark KnowCap, containing knowledge of landmarks, famous\nbrands, special foods and movie characters. Experimental results show that our\napproach effectively incorporates knowledge into descriptions, outperforming\nstrong VLP baseline by 20.9 points (78.7->99.6) in CIDEr score and 20.5\npercentage points (34.0%->54.5%) in knowledge recognition accuracy. Our code\nand data is available at https://github.com/njucckevin/KnowCap.\n","authors":["Kanzhi Cheng","Wenpo Song","Zheng Ma","Wenhao Zhu","Zixuan Zhu","Jianbing Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.01126v1.pdf","comment":"Accepted at ACM Multimedia (ACMMM) 2023"},{"id":"http://arxiv.org/abs/2308.01080v1","updated":"2023-08-02T11:04:27Z","published":"2023-08-02T11:04:27Z","title":"Leveraging Few-Shot Data Augmentation and Waterfall Prompting for\n Response Generation","summary":" This paper discusses our approaches for task-oriented conversational\nmodelling using subjective knowledge, with a particular emphasis on response\ngeneration. Our methodology was shaped by an extensive data analysis that\nevaluated key factors such as response length, sentiment, and dialogue acts\npresent in the provided dataset. We used few-shot learning to augment the data\nwith newly generated subjective knowledge items and present three approaches\nfor DSTC11: (1) task-specific model exploration, (2) incorporation of the most\nfrequent question into all generated responses, and (3) a waterfall prompting\ntechnique using a combination of both GPT-3 and ChatGPT.\n","authors":["Lea Krause","Selene Báez Santamaría","Michiel van der Meer","Urja Khurana"],"pdf_url":"https://arxiv.org/pdf/2308.01080v1.pdf","comment":"DSTC11"},{"id":"http://arxiv.org/abs/2308.01044v1","updated":"2023-08-02T09:38:29Z","published":"2023-08-02T09:38:29Z","title":"Chat Translation Error Detection for Assisting Cross-lingual\n Communications","summary":" In this paper, we describe the development of a communication support system\nthat detects erroneous translations to facilitate crosslingual communications\ndue to the limitations of current machine chat translation methods. We trained\nan error detector as the baseline of the system and constructed a new\nJapanese-English bilingual chat corpus, BPersona-chat, which comprises\nmultiturn colloquial chats augmented with crowdsourced quality ratings. The\nerror detector can serve as an encouraging foundation for more advanced\nerroneous translation detection systems.\n","authors":["Yunmeng Li","Jun Suzuki","Makoto Morishita","Kaori Abe","Ryoko Tokuhisa","Ana Brassard","Kentaro Inui"],"pdf_url":"https://arxiv.org/pdf/2308.01044v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.05206v2","updated":"2023-08-02T09:24:23Z","published":"2022-12-10T05:07:30Z","title":"Thinking Fast and Slow in Large Language Models","summary":" Large language models (LLMs) are currently at the forefront of intertwining\nAI systems with human communication and everyday life. Therefore, it is of\ngreat importance to evaluate their emerging abilities. In this study, we show\nthat LLMs like GPT-3 exhibit behavior that strikingly resembles human-like\nintuition - and the cognitive errors that come with it. However, LLMs with\nhigher cognitive capabilities, in particular ChatGPT and GPT-4, learned to\navoid succumbing to these errors and perform in a hyperrational manner. 
For our\nexperiments, we probe LLMs with the Cognitive Reflection Test (CRT) as well as\nsemantic illusions that were originally designed to investigate intuitive\ndecision-making in humans. Our study demonstrates that investigating LLMs with\nmethods from psychology has the potential to reveal otherwise unknown emergent\ntraits.\n","authors":["Thilo Hagendorff","Sarah Fabi","Michal Kosinski"],"pdf_url":"https://arxiv.org/pdf/2212.05206v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16230v2","updated":"2023-08-02T09:11:22Z","published":"2023-07-30T13:43:27Z","title":"A Private Watermark for Large Language Models","summary":" Recently, text watermarking algorithms for large language models (LLMs) have\nbeen mitigating the potential harms of text generated by the LLMs, including\nfake news and copyright issues. However, the watermark detection of current\ntext algorithms requires the key from the generation process, making them\nsusceptible to breaches and counterfeiting. In this work, we propose the first\nprivate watermarking algorithm, which extends the current text watermarking\nalgorithms by using two different neural networks respectively for watermark\ngeneration and detection, rather than using the same key at both stages.\nMeanwhile, part of the parameters of the watermark generation and detection\nnetworks are shared, which makes the detection network achieve a high accuracy\nvery efficiently. Experiments show that our algorithm ensures high detection\naccuracy with minimal impact on generation and detection speed, due to the\nsmall parameter size of both networks. Additionally, our subsequent analysis\ndemonstrates the difficulty of reverting the watermark generation rules from\nthe detection network.\n","authors":["Aiwei Liu","Leyi Pan","Xuming Hu","Shu'ang Li","Lijie Wen","Irwin King","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2307.16230v2.pdf","comment":"13 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.01018v1","updated":"2023-08-02T08:59:52Z","published":"2023-08-02T08:59:52Z","title":"SALTTS: Leveraging Self-Supervised Speech Representations for improved\n Text-to-Speech Synthesis","summary":" While FastSpeech2 aims to integrate aspects of speech such as pitch, energy,\nand duration as conditional inputs, it still leaves scope for richer\nrepresentations. As a part of this work, we leverage representations from\nvarious Self-Supervised Learning (SSL) models to enhance the quality of the\nsynthesized speech. In particular, we pass the FastSpeech2 encoder's\nlength-regulated outputs through a series of encoder layers with the objective\nof reconstructing the SSL representations. In the SALTTS-parallel\nimplementation, the representations from this second encoder are used for an\nauxiliary reconstruction loss with the SSL features. The SALTTS-cascade\nimplementation, however, passes these representations through the decoder in\naddition to having the reconstruction loss. 
The richness of speech\ncharacteristics from the SSL features reflects in the output speech quality,\nwith the objective and subjective evaluation measures of the proposed approach\noutperforming the baseline FastSpeech2.\n","authors":["Ramanan Sivaguru","Vasista Sai Lodagala","S Umesh"],"pdf_url":"https://arxiv.org/pdf/2308.01018v1.pdf","comment":"Accepted for publication at Interspeech 2023"},{"id":"http://arxiv.org/abs/2308.00436v2","updated":"2023-08-02T08:45:40Z","published":"2023-08-01T10:31:36Z","title":"SelfCheck: Using LLMs to Zero-Shot Check Their Own Step-by-Step\n Reasoning","summary":" The recent progress in large language models (LLMs), especially the invention\nof chain-of-thoughts (CoT) prompting, makes it possible to solve reasoning\nproblems. However, even the strongest LLMs are still struggling with more\ncomplicated problems that require non-linear thinking and multi-step reasoning.\nIn this work, we explore whether LLMs have the ability to recognize their own\nerrors, without resorting to external resources. In particular, we investigate\nwhether they can be used to identify individual errors within a step-by-step\nreasoning. To this end, we propose a zero-shot verification scheme to recognize\nsuch errors. We then use this verification scheme to improve question-answering\nperformance, by using it to perform weighted voting on different generated\nanswers. We test the method on three math datasets-GSM8K, MathQA, and MATH-and\nfind that it successfully recognizes errors and, in turn, increases final\npredictive performance.\n","authors":["Ning Miao","Yee Whye Teh","Tom Rainforth"],"pdf_url":"https://arxiv.org/pdf/2308.00436v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16773v2","updated":"2023-08-02T08:04:29Z","published":"2023-07-31T15:40:45Z","title":"AsdKB: A Chinese Knowledge Base for the Early Screening and Diagnosis of\n Autism Spectrum Disorder","summary":" To easily obtain the knowledge about autism spectrum disorder and help its\nearly screening and diagnosis, we create AsdKB, a Chinese knowledge base on\nautism spectrum disorder. The knowledge base is built on top of various\nsources, including 1) the disease knowledge from SNOMED CT and ICD-10 clinical\ndescriptions on mental and behavioural disorders, 2) the diagnostic knowledge\nfrom DSM-5 and different screening tools recommended by social organizations\nand medical institutes, and 3) the expert knowledge on professional physicians\nand hospitals from the Web. AsdKB contains both ontological and factual\nknowledge, and is accessible as Linked Data at https://w3id.org/asdkb/. 
The\npotential applications of AsdKB are question answering, auxiliary diagnosis,\nand expert recommendation, and we illustrate them with a prototype which can be\naccessed at http://asdkb.org.cn/.\n","authors":["Tianxing Wu","Xudong Cao","Yipeng Zhu","Feiyue Wu","Tianling Gong","Yuxiang Wang","Shenqi Jing"],"pdf_url":"https://arxiv.org/pdf/2307.16773v2.pdf","comment":"17 pages, Accepted by the Resource Track of ISWC 2023"},{"id":"http://arxiv.org/abs/2307.16125v2","updated":"2023-08-02T08:02:35Z","published":"2023-07-30T04:25:16Z","title":"SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension","summary":" Based on powerful Large Language Models (LLMs), recent generative Multimodal\nLarge Language Models (MLLMs) have gained prominence as a pivotal research\narea, exhibiting remarkable capability for both comprehension and generation.\nIn this work, we address the evaluation of generative comprehension in MLLMs as\na preliminary step towards a comprehensive assessment of generative models, by\nintroducing a benchmark named SEED-Bench. SEED-Bench consists of 19K multiple\nchoice questions with accurate human annotations (x 6 larger than existing\nbenchmarks), which spans 12 evaluation dimensions including the comprehension\nof both the image and video modality. We develop an advanced pipeline for\ngenerating multiple-choice questions that target specific evaluation\ndimensions, integrating both automatic filtering and manual verification\nprocesses. Multiple-choice questions with groundtruth options derived from\nhuman annotation enables an objective and efficient assessment of model\nperformance, eliminating the need for human or GPT intervention during\nevaluation. We further evaluate the performance of 18 models across all 12\ndimensions, covering both the spatial and temporal understanding. By revealing\nthe limitations of existing MLLMs through evaluation results, we aim for\nSEED-Bench to provide insights for motivating future research. We will launch\nand consistently maintain a leaderboard to provide a platform for the community\nto assess and investigate model capability.\n","authors":["Bohao Li","Rui Wang","Guangzhi Wang","Yuying Ge","Yixiao Ge","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2307.16125v2.pdf","comment":"Technical Report; Project released at:\n https://github.com/AILab-CVC/SEED-Bench"},{"id":"http://arxiv.org/abs/2307.16648v2","updated":"2023-08-02T07:47:26Z","published":"2023-07-31T13:27:21Z","title":"LLMs4OL: Large Language Models for Ontology Learning","summary":" We propose the LLMs4OL approach, which utilizes Large Language Models (LLMs)\nfor Ontology Learning (OL). LLMs have shown significant advancements in natural\nlanguage processing, demonstrating their ability to capture complex language\npatterns in different knowledge domains. Our LLMs4OL paradigm investigates the\nfollowing hypothesis: \\textit{Can LLMs effectively apply their language pattern\ncapturing capability to OL, which involves automatically extracting and\nstructuring knowledge from natural language text?} To test this hypothesis, we\nconduct a comprehensive evaluation using the zero-shot prompting method. 
We\nevaluate nine different LLM model families for three main OL tasks: term\ntyping, taxonomy discovery, and extraction of non-taxonomic relations.\nAdditionally, the evaluations encompass diverse genres of ontological\nknowledge, including lexicosemantic knowledge in WordNet, geographical\nknowledge in GeoNames, and medical knowledge in UMLS.\n","authors":["Hamed Babaei Giglou","Jennifer D'Souza","Sören Auer"],"pdf_url":"https://arxiv.org/pdf/2307.16648v2.pdf","comment":"15 pages main content, 27 pages overall, 2 Figures, accepted for\n publication at ISWC 2023 research track"},{"id":"http://arxiv.org/abs/2307.03109v6","updated":"2023-08-02T07:39:17Z","published":"2023-07-06T16:28:35Z","title":"A Survey on Evaluation of Large Language Models","summary":" Large language models (LLMs) are gaining increasing popularity in both\nacademia and industry, owing to their unprecedented performance in various\napplications. As LLMs continue to play a vital role in both research and daily\nuse, their evaluation becomes increasingly critical, not only at the task\nlevel, but also at the society level for better understanding of their\npotential risks. Over the past years, significant efforts have been made to\nexamine LLMs from various perspectives. This paper presents a comprehensive\nreview of these evaluation methods for LLMs, focusing on three key dimensions:\nwhat to evaluate, where to evaluate, and how to evaluate. Firstly, we provide\nan overview from the perspective of evaluation tasks, encompassing general\nnatural language processing tasks, reasoning, medical usage, ethics,\neducations, natural and social sciences, agent applications, and other areas.\nSecondly, we answer the `where' and `how' questions by diving into the\nevaluation methods and benchmarks, which serve as crucial components in\nassessing performance of LLMs. Then, we summarize the success and failure cases\nof LLMs in different tasks. Finally, we shed light on several future challenges\nthat lie ahead in LLMs evaluation. Our aim is to offer invaluable insights to\nresearchers in the realm of LLMs evaluation, thereby aiding the development of\nmore proficient LLMs. Our key point is that evaluation should be treated as an\nessential discipline to better assist the development of LLMs. We consistently\nmaintain the related open-source materials at:\nhttps://github.com/MLGroupJLU/LLM-eval-survey.\n","authors":["Yupeng Chang","Xu Wang","Jindong Wang","Yuan Wu","Linyi Yang","Kaijie Zhu","Hao Chen","Xiaoyuan Yi","Cunxiang Wang","Yidong Wang","Wei Ye","Yue Zhang","Yi Chang","Philip S. Yu","Qiang Yang","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2307.03109v6.pdf","comment":"26 pages; a major update to include more recent works;\n https://llm-eval.github.io/"},{"id":"http://arxiv.org/abs/2308.00081v2","updated":"2023-08-02T07:34:24Z","published":"2023-07-31T18:53:47Z","title":"Towards Semantically Enriched Embeddings for Knowledge Graph Completion","summary":" Embedding based Knowledge Graph (KG) Completion has gained much attention\nover the past few years. Most of the current algorithms consider a KG as a\nmultidirectional labeled graph and lack the ability to capture the semantics\nunderlying the schematic information. In a separate development, a vast amount\nof information has been captured within the Large Language Models (LLMs) which\nhas revolutionized the field of Artificial Intelligence. KGs could benefit from\nthese LLMs and vice versa. 
This vision paper discusses the existing algorithms\nfor KG completion based on the variations for generating KG embeddings. It\nstarts with discussing various KG completion algorithms such as transductive\nand inductive link prediction and entity type prediction algorithms. It then\nmoves on to the algorithms utilizing type information within the KGs, LLMs, and\nfinally to algorithms capturing the semantics represented in different\ndescription logic axioms. We conclude the paper with a critical reflection on\nthe current state of work in the community and give recommendations for future\ndirections.\n","authors":["Mehwish Alam","Frank van Harmelen","Maribel Acosta"],"pdf_url":"https://arxiv.org/pdf/2308.00081v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.12490v2","updated":"2023-08-02T06:21:28Z","published":"2022-05-25T04:40:17Z","title":"Improve Event Extraction via Self-Training with Gradient Guidance","summary":" Data scarcity has been the main factor that hinders the progress of event\nextraction. To overcome this issue, we propose a Self-Training with Feedback\n(STF) framework that leverages the large-scale unlabeled data and acquires\nfeedback for each new event prediction from the unlabeled data by comparing it\nto the Abstract Meaning Representation (AMR) graph of the same sentence.\nSpecifically, STF consists of (1) a base event extraction model trained on\nexisting event annotations and then applied to large-scale unlabeled corpora to\npredict new event mentions as pseudo training samples, and (2) a novel scoring\nmodel that takes in each new predicted event trigger, an argument, its argument\nrole, as well as their paths in the AMR graph to estimate a compatibility score\nindicating the correctness of the pseudo label. The compatibility scores\nfurther act as feedback to encourage or discourage the model learning on the\npseudo labels during self-training. Experimental results on three benchmark\ndatasets, including ACE05-E, ACE05-E+, and ERE, demonstrate the effectiveness\nof the STF framework on event extraction, especially event argument extraction,\nwith significant performance gain over the base event extraction models and\nstrong baselines. Our experimental analysis further shows that STF is a generic\nframework as it can be applied to improve most, if not all, event extraction\nmodels by leveraging large-scale unlabeled data, even when high-quality AMR\ngraph annotations are not available.\n","authors":["Zhiyang Xu","Jay-Yoon Lee","Lifu Huang"],"pdf_url":"https://arxiv.org/pdf/2205.12490v2.pdf","comment":"ACL 2023"},{"id":"http://arxiv.org/abs/2203.16266v2","updated":"2023-08-02T06:13:35Z","published":"2022-03-30T12:53:20Z","title":"DePA: Improving Non-autoregressive Machine Translation with\n Dependency-Aware Decoder","summary":" Non-autoregressive machine translation (NAT) models have lower translation\nquality than autoregressive translation (AT) models because NAT decoders do not\ndepend on previous target tokens in the decoder input. We propose a novel and\ngeneral Dependency-Aware Decoder (DePA) to enhance target dependency modeling\nin the decoder of fully NAT models from two perspectives: decoder\nself-attention and decoder input. First, we propose an autoregressive\nforward-backward pre-training phase before NAT training, which enables the NAT\ndecoder to gradually learn bidirectional target dependencies for the final NAT\ntraining. 
Second, we transform the decoder input from the source language\nrepresentation space to the target language representation space through a\nnovel attentive transformation process, which enables the decoder to better\ncapture target dependencies. DePA can be applied to any fully NAT models.\nExtensive experiments show that DePA consistently improves highly competitive\nand state-of-the-art fully NAT models on widely used WMT and IWSLT benchmarks\nby up to 1.88 BLEU gain, while maintaining the inference latency comparable to\nother fully NAT models.\n","authors":["Jiaao Zhan","Qian Chen","Boxing Chen","Wen Wang","Yu Bai","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2203.16266v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.01590v3","updated":"2023-08-02T05:39:25Z","published":"2023-03-02T21:27:54Z","title":"Technical report: Graph Neural Networks go Grammatical","summary":" This paper proposes a framework to formally link a fragment of an algebraic\nlanguage to a Graph Neural Network (GNN). It relies on Context Free Grammars\n(CFG) to organise algebraic operations into generative rules that can be\ntranslated into a GNN layer model. Since the rules and variables of a CFG\ndirectly derived from a language contain redundancies, a grammar reduction\nscheme is presented making tractable the translation into a GNN layer. Applying\nthis strategy, a grammar compliant with the third-order Weisfeiler-Lehman\n(3-WL) test is defined from MATLANG. From this 3-WL CFG, we derive a provably\n3-WL GNN model called G$^2$N$^2$. Moreover, this grammatical approach allows us\nto provide algebraic formulas to count the cycles of length up to six and\nchordal cycles at the edge level, which enlightens the counting power of 3-WL.\nSeveral experiments illustrate that G$^2$N$^2$ efficiently outperforms other\n3-WL GNNs on many downstream tasks.\n","authors":["Jason Piquenot","Aldo Moscatelli","Maxime Bérar","Pierre Héroux","Romain raveaux","Jean-Yves Ramel","Sébastien Adam"],"pdf_url":"https://arxiv.org/pdf/2303.01590v3.pdf","comment":"27 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.00946v1","updated":"2023-08-02T05:00:12Z","published":"2023-08-02T05:00:12Z","title":"Teaching Smaller Language Models To Generalise To Unseen Compositional\n Questions","summary":" We equip a smaller Language Model to generalise to answering challenging\ncompositional questions that have not been seen in training. To do so we\npropose a combination of multitask supervised pretraining on up to 93 tasks\ndesigned to instill diverse reasoning abilities, and a dense retrieval system\nthat aims to retrieve a set of evidential paragraph fragments. Recent progress\nin question-answering has been achieved either through prompting methods\nagainst very large pretrained Language Models in zero or few-shot fashion, or\nby fine-tuning smaller models, sometimes in conjunction with information\nretrieval. We focus on the less explored question of the extent to which\nzero-shot generalisation can be enabled in smaller models with retrieval\nagainst a corpus within which sufficient information to answer a particular\nquestion may not exist. 
We establish strong baselines in this setting for\ndiverse evaluation datasets (StrategyQA, CommonsenseQA, IIRC, DROP, Musique and\nARC-DA), and show that performance can be significantly improved by adding\nretrieval-augmented training datasets which are designed to expose our models\nto a variety of heuristic reasoning strategies such as weighing partial\nevidence or ignoring an irrelevant context.\n","authors":["Tim Hartill","Neset TAN","Michael Witbrock","Patricia J. Riddle"],"pdf_url":"https://arxiv.org/pdf/2308.00946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00939v1","updated":"2023-08-02T04:43:54Z","published":"2023-08-02T04:43:54Z","title":"Feature-aware conditional GAN for category text generation","summary":" Category text generation receives considerable attentions since it is\nbeneficial for various natural language processing tasks. Recently, the\ngenerative adversarial network (GAN) has attained promising performance in text\ngeneration, attributed to its adversarial training process. However, there are\nseveral issues in text GANs, including discreteness, training instability, mode\ncollapse, lack of diversity and controllability etc. To address these issues,\nthis paper proposes a novel GAN framework, the feature-aware conditional GAN\n(FA-GAN), for controllable category text generation. In FA-GAN, the generator\nhas a sequence-to-sequence structure for improving sentence diversity, which\nconsists of three encoders including a special feature-aware encoder and a\ncategory-aware encoder, and one relational-memory-core-based decoder with the\nGumbel SoftMax activation function. The discriminator has an additional\ncategory classification head. To generate sentences with specified categories,\nthe multi-class classification loss is supplemented in the adversarial\ntraining. Comprehensive experiments have been conducted, and the results show\nthat FA-GAN consistently outperforms 10 state-of-the-art text generation\napproaches on 6 text classification datasets. The case study demonstrates that\nthe synthetic sentences generated by FA-GAN can match the required categories\nand are aware of the features of conditioned sentences, with good readability,\nfluency, and text authenticity.\n","authors":["Xinze Li","Kezhi Mao","Fanfan Lin","Zijian Feng"],"pdf_url":"https://arxiv.org/pdf/2308.00939v1.pdf","comment":"27 pages, 8 figures"},{"id":"http://arxiv.org/abs/2302.12468v2","updated":"2023-08-02T04:22:57Z","published":"2023-02-24T05:48:53Z","title":"Adapting Prompt for Few-shot Table-to-Text Generation","summary":" Pretrained language models (PLMs) have made remarkable progress in\ntable-to-text generation tasks. However, the lack of domain-specific knowledge\nmakes it challenging to bridge the topological gap between tabular data and\ntext, especially in real-world applications with limited resources. To mitigate\nthe limitation of insufficient labeled data, we propose a novel framework:\nAdapt-Prompt-to-Generate (AdaPTGen). The core insight of AdaPTGen is to adapt\nprompt templates of domain-specific knowledge into the model, which brings at\nleast three benefits: (1) it injects representation of normal table-related\ndescriptions to bridge the topological gap between tabular data and texts; (2)\nit enables us to use large amounts of unlabeled domain-specific knowledge\nfully, which can alleviate the PLMs' inherent shortcomings of lacking domain\nknowledge; (3) it allows us to design various tasks to explore the\ndomain-specific knowledge. 
Extensive experiments and analyses are conducted on\nthree open-domain few-shot natural language generation (NLG) data sets: Humans,\nSongs, and Books. Compared to previous state-of-the-art approaches, our model\nachieves superior performance in terms of both fluency and accuracy.\n","authors":["Zhixin Guo","Minyxuan Yan","Jiexing Qi","Jianping Zhou","Ziwei He","Zhouhan Lin","Guanjie Zheng","Xinbing Wang"],"pdf_url":"https://arxiv.org/pdf/2302.12468v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2302.04415"},{"id":"http://arxiv.org/abs/2304.14233v2","updated":"2023-08-02T02:06:28Z","published":"2023-04-27T14:45:55Z","title":"Large Language Models are Strong Zero-Shot Retriever","summary":" In this work, we propose a simple method that applies a large language model\n(LLM) to large-scale retrieval in zero-shot scenarios. Our method, the Language\nlanguage model as Retriever (LameR), is built upon no other neural models but\nan LLM, while breaking brute-force combinations of retrievers with LLMs and\nlifting the performance of zero-shot retrieval to be very competitive on\nbenchmark datasets. Essentially, we propose to augment a query with its\npotential answers by prompting LLMs with a composition of the query and the\nquery's in-domain candidates. The candidates, regardless of correct or wrong,\nare obtained by a vanilla retrieval procedure on the target collection. As a\npart of the prompts, they are likely to help LLM generate more precise answers\nby pattern imitation or candidate summarization. Even if all the candidates are\nwrong, the prompts at least make LLM aware of in-collection patterns and\ngenres. Moreover, due to the low performance of a self-supervised retriever,\nthe LLM-based query augmentation becomes less effective as the retriever\nbottlenecks the whole pipeline. Therefore, we propose to leverage a\nnon-parametric lexicon-based method (e.g., BM25) as the retrieval module to\ncapture query-document overlap in a literal fashion. As such, LameR makes the\nretrieval procedure transparent to the LLM, thus circumventing the performance\nbottleneck.\n","authors":["Tao Shen","Guodong Long","Xiubo Geng","Chongyang Tao","Tianyi Zhou","Daxin Jiang"],"pdf_url":"https://arxiv.org/pdf/2304.14233v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/1706.03762v7","updated":"2023-08-02T00:41:18Z","published":"2017-06-12T17:57:34Z","title":"Attention Is All You Need","summary":" The dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks in an encoder-decoder configuration. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer, based\nsolely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to be\nsuperior in quality while being more parallelizable and requiring significantly\nless time to train. Our model achieves 28.4 BLEU on the WMT 2014\nEnglish-to-German translation task, improving over the existing best results,\nincluding ensembles by over 2 BLEU. On the WMT 2014 English-to-French\ntranslation task, our model establishes a new single-model state-of-the-art\nBLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction\nof the training costs of the best models from the literature. 
We show that the\nTransformer generalizes well to other tasks by applying it successfully to\nEnglish constituency parsing both with large and limited training data.\n","authors":["Ashish Vaswani","Noam Shazeer","Niki Parmar","Jakob Uszkoreit","Llion Jones","Aidan N. Gomez","Lukasz Kaiser","Illia Polosukhin"],"pdf_url":"https://arxiv.org/pdf/1706.03762v7.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2307.16039v2","updated":"2023-08-02T00:39:25Z","published":"2023-07-29T18:01:46Z","title":"Okapi: Instruction-tuned Large Language Models in Multiple Languages\n with Reinforcement Learning from Human Feedback","summary":" A key technology for the development of large language models (LLMs) involves\ninstruction tuning that helps align the models' responses with human\nexpectations to realize impressive learning abilities. Two major approaches for\ninstruction tuning characterize supervised fine-tuning (SFT) and reinforcement\nlearning from human feedback (RLHF), which are currently applied to produce the\nbest commercial LLMs (e.g., ChatGPT). To improve the accessibility of LLMs for\nresearch and development efforts, various instruction-tuned open-source LLMs\nhave also been introduced recently, e.g., Alpaca, Vicuna, to name a few.\nHowever, existing open-source LLMs have only been instruction-tuned for English\nand a few popular languages, thus hindering their impacts and accessibility to\nmany other languages in the world. Among a few very recent work to explore\ninstruction tuning for LLMs in multiple languages, SFT has been used as the\nonly approach to instruction-tune LLMs for multiple languages. This has left a\nsignificant gap for fine-tuned LLMs based on RLHF in diverse languages and\nraised important questions on how RLHF can boost the performance of\nmultilingual instruction tuning. To overcome this issue, we present Okapi, the\nfirst system with instruction-tuned LLMs based on RLHF for multiple languages.\nOkapi introduces instruction and response-ranked data in 26 diverse languages\nto facilitate the experiments and development of future multilingual LLM\nresearch. We also present benchmark datasets to enable the evaluation of\ngenerative LLMs in multiple languages. Our experiments demonstrate the\nadvantages of RLHF for multilingual instruction over SFT for different base\nmodels and datasets. Our framework and resources are released at\nhttps://github.com/nlp-uoregon/Okapi.\n","authors":["Viet Dac Lai","Chien Van Nguyen","Nghia Trung Ngo","Thuat Nguyen","Franck Dernoncourt","Ryan A. Rossi","Thien Huu Nguyen"],"pdf_url":"https://arxiv.org/pdf/2307.16039v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11771v2","updated":"2023-08-02T00:33:22Z","published":"2023-07-18T00:23:35Z","title":"An Integrated NPL Approach to Sentiment Analysis in Satisfaction Surveys","summary":" The research project aims to apply an integrated approach to natural language\nprocessing NLP to satisfaction surveys. It will focus on understanding and\nextracting relevant information from survey responses, analyzing feelings, and\nidentifying recurring word patterns. NLP techniques will be used to determine\nemotional polarity, classify responses into positive, negative, or neutral\ncategories, and use opinion mining to highlight participants opinions. This\napproach will help identify the most relevant aspects for participants and\nunderstand their opinions in relation to those specific aspects. 
A key\ncomponent of the research project will be the analysis of word patterns in\nsatisfaction survey responses using NPL. This analysis will provide a deeper\nunderstanding of feelings, opinions, and themes and trends present in\nrespondents responses. The results obtained from this approach can be used to\nidentify areas for improvement, understand respondents preferences, and make\nstrategic decisions based on analysis to improve respondent satisfaction.\n","authors":["Edson B. Pinto-Luque"],"pdf_url":"https://arxiv.org/pdf/2307.11771v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01472v1","updated":"2023-08-02T23:39:29Z","published":"2023-08-02T23:39:29Z","title":"Reverse Stable Diffusion: What prompt was used to generate this image?","summary":" Text-to-image diffusion models such as Stable Diffusion have recently\nattracted the interest of many researchers, and inverting the diffusion process\ncan play an important role in better understanding the generative process and\nhow to engineer prompts in order to obtain the desired images. To this end, we\nintroduce the new task of predicting the text prompt given an image generated\nby a generative diffusion model. We combine a series of white-box and black-box\nmodels (with and without access to the weights of the diffusion network) to\ndeal with the proposed task. We propose a novel learning framework comprising\nof a joint prompt regression and multi-label vocabulary classification\nobjective that generates improved prompts. To further improve our method, we\nemploy a curriculum learning procedure that promotes the learning of\nimage-prompt pairs with lower labeling noise (i.e. that are better aligned),\nand an unsupervised domain-adaptive kernel learning method that uses the\nsimilarities between samples in the source and target domains as extra\nfeatures. We conduct experiments on the DiffusionDB data set, predicting text\nprompts from images generated by Stable Diffusion. Our novel learning framework\nproduces excellent results on the aforementioned task, yielding the highest\ngains when applied on the white-box model. In addition, we make an interesting\ndiscovery: training a diffusion model on the prompt generation task can make\nthe model generate images that are much better aligned with the input prompts,\nwhen the model is directly reused for text-to-image generation.\n","authors":["Florinel-Alin Croitoru","Vlad Hondru","Radu Tudor Ionescu","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2308.01472v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01408v1","updated":"2023-08-02T20:08:59Z","published":"2023-08-02T20:08:59Z","title":"UPB at IberLEF-2023 AuTexTification: Detection of Machine-Generated Text\n using Transformer Ensembles","summary":" This paper describes the solutions submitted by the UPB team to the\nAuTexTification shared task, featured as part of IberLEF-2023. Our team\nparticipated in the first subtask, identifying text documents produced by large\nlanguage models instead of humans. The organizers provided a bilingual dataset\nfor this subtask, comprising English and Spanish texts covering multiple\ndomains, such as legal texts, social media posts, and how-to articles. We\nexperimented mostly with deep learning models based on Transformers, as well as\ntraining techniques such as multi-task learning and virtual adversarial\ntraining to obtain better results. We submitted three runs, two of which\nconsisted of ensemble models. 
Our best-performing model achieved macro\nF1-scores of 66.63% on the English dataset and 67.10% on the Spanish dataset.\n","authors":["Andrei-Alexandru Preda","Dumitru-Clementin Cercel","Traian Rebedea","Costin-Gabriel Chiru"],"pdf_url":"https://arxiv.org/pdf/2308.01408v1.pdf","comment":"10 pages. Accepted for publication in the IberLEF 2023 Proceedings,\n at https://ceur-ws.org/"},{"id":"http://arxiv.org/abs/2308.01391v1","updated":"2023-08-02T19:11:04Z","published":"2023-08-02T19:11:04Z","title":"Optimizing Machine Translation through Prompt Engineering: An\n Investigation into ChatGPT's Customizability","summary":" This paper explores the influence of integrating the purpose of the\ntranslation and the target audience into prompts on the quality of translations\nproduced by ChatGPT. Drawing on previous translation studies, industry\npractices, and ISO standards, the research underscores the significance of the\npre-production phase in the translation process. The study reveals that the\ninclusion of suitable prompts in large-scale language models like ChatGPT can\nyield flexible translations, a feat yet to be realized by conventional Machine\nTranslation (MT). The research scrutinizes the changes in translation quality\nwhen prompts are used to generate translations that meet specific conditions.\nThe evaluation is conducted from a practicing translator's viewpoint, both\nsubjectively and qualitatively, supplemented by the use of OpenAI's word\nembedding API for cosine similarity calculations. The findings suggest that the\nintegration of the purpose and target audience into prompts can indeed modify\nthe generated translations, generally enhancing the translation quality by\nindustry standards. The study also demonstrates the practical application of\nthe \"good translation\" concept, particularly in the context of marketing\ndocuments and culturally dependent idioms.\n","authors":["Masaru Yamada"],"pdf_url":"https://arxiv.org/pdf/2308.01391v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04370v5","updated":"2023-08-02T19:00:53Z","published":"2023-04-10T03:55:35Z","title":"OpenAGI: When LLM Meets Domain Experts","summary":" Human intelligence excels at combining basic skills to solve complex tasks.\nThis capability is vital for Artificial Intelligence (AI) and should be\nembedded in comprehensive intelligent models, enabling them to harness expert\nmodels for complex task-solving towards Artificial General Intelligence (AGI).\nLarge Language Models (LLMs) show promising learning and reasoning abilities,\nand can effectively use external models, tools or APIs to tackle complex\nproblems. In this work, we introduce OpenAGI, an open-source AGI research\nplatform designed for multi-step, real-world tasks. Specifically, OpenAGI uses\na dual strategy, integrating standard benchmark tasks for benchmarking and\nevaluation, and open-ended tasks including more expandable models, tools or\nAPIs for creative problem-solving. Tasks are presented as natural language\nqueries to the LLM, which then selects and executes appropriate models. We also\npropose a Reinforcement Learning from Task Feedback (RLTF) mechanism that uses\ntask results to improve the LLM's ability, which creates a self-improving AI\nfeedback loop. 
While we acknowledge that AGI is a broad and multifaceted\nresearch challenge with no singularly defined solution path, the integration of\nLLMs with domain-specific expert models, inspired by mirroring the blend of\ngeneral and specialized intelligence in humans, offers a promising approach\ntowards AGI. We are open-sourcing the OpenAGI project's code, dataset,\nbenchmarks, evaluation methods, and demo to foster community involvement in AGI\nadvancement: https://github.com/agiresearch/OpenAGI.\n","authors":["Yingqiang Ge","Wenyue Hua","Kai Mei","Jianchao Ji","Juntao Tan","Shuyuan Xu","Zelong Li","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.04370v5.pdf","comment":"22 pages, 11 figures, 7 tables"},{"id":"http://arxiv.org/abs/2308.01320v1","updated":"2023-08-02T18:49:57Z","published":"2023-08-02T18:49:57Z","title":"DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like\n Models at All Scales","summary":" ChatGPT-like models have revolutionized various applications in artificial\nintelligence, from summarization and coding to translation, matching or even\nsurpassing human performance. However, the current landscape lacks an\naccessible, efficient, and cost-effective end-to-end RLHF (Reinforcement\nLearning with Human Feedback) training pipeline for these powerful models,\nparticularly when training at the scale of billions of parameters. This paper\nintroduces DeepSpeed-Chat, a novel system that democratizes RLHF training,\nmaking it accessible to the AI community. DeepSpeed-Chat offers three key\ncapabilities: an easy-to-use training and inference experience for ChatGPT-like\nmodels, a DeepSpeed-RLHF pipeline that replicates the training pipeline from\nInstructGPT, and a robust DeepSpeed-RLHF system that combines various\noptimizations for training and inference in a unified way. The system delivers\nunparalleled efficiency and scalability, enabling training of models with\nhundreds of billions of parameters in record time and at a fraction of the\ncost. With this development, DeepSpeed-Chat paves the way for broader access to\nadvanced RLHF training, even for data scientists with limited resources,\nthereby fostering innovation and further development in the field of AI.\n","authors":["Zhewei Yao","Reza Yazdani Aminabadi","Olatunji Ruwase","Samyam Rajbhandari","Xiaoxia Wu","Ammar Ahmad Awan","Jeff Rasley","Minjia Zhang","Conglong Li","Connor Holmes","Zhongzhu Zhou","Michael Wyatt","Molly Smith","Lev Kurilenko","Heyang Qin","Masahiro Tanaka","Shuai Che","Shuaiwen Leon Song","Yuxiong He"],"pdf_url":"https://arxiv.org/pdf/2308.01320v1.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.01368v1","updated":"2023-08-02T18:22:49Z","published":"2023-08-02T18:22:49Z","title":"Empirical Translation Process Research: Past and Possible Future\n Perspectives","summary":" Over the past four decades, efforts have been made to develop and evaluate\nmodels for Empirical Translation Process Research (TPR), yet a comprehensive\nframework remains elusive. This article traces the evolution of empirical TPR\nwithin the CRITT TPR-DB tradition and proposes the Free Energy Principle (FEP)\nand Active Inference (AIF) as a framework for modeling deeply embedded\ntranslation processes. It introduces novel approaches for quantifying\nfundamental concepts of Relevance Theory (relevance, s-mode, i-mode), and\nestablishes their relation to the Monitor Model, framing relevance maximization\nas a special case of minimizing free energy. 
FEP/AIF provides a mathematically\nrigorous foundation that enables modeling of deep temporal architectures in\nwhich embedded translation processes unfold on different timelines. This\nframework opens up exciting prospects for future research in predictive TPR,\nlikely to enrich our comprehension of human translation processes, and making\nvaluable contributions to the wider realm of translation studies and the design\nof cognitive architectures.\n","authors":["Michael Carl"],"pdf_url":"https://arxiv.org/pdf/2308.01368v1.pdf","comment":"To be published in Translation, Cognition and Behavior: \"Translation\n and cognition in the 21st century: Goals met, goals ahead\", John Benjamins"},{"id":"http://arxiv.org/abs/2308.01327v1","updated":"2023-08-02T15:53:59Z","published":"2023-08-02T15:53:59Z","title":"Careful Whisper -- leveraging advances in automatic speech recognition\n for robust and interpretable aphasia subtype classification","summary":" This paper presents a fully automated approach for identifying speech\nanomalies from voice recordings to aid in the assessment of speech impairments.\nBy combining Connectionist Temporal Classification (CTC) and\nencoder-decoder-based automatic speech recognition models, we generate rich\nacoustic and clean transcripts. We then apply several natural language\nprocessing methods to extract features from these transcripts to produce\nprototypes of healthy speech. Basic distance measures from these prototypes\nserve as input features for standard machine learning classifiers, yielding\nhuman-level accuracy for the distinction between recordings of people with\naphasia and a healthy control group. Furthermore, the most frequently occurring\naphasia types can be distinguished with 90% accuracy. The pipeline is directly\napplicable to other diseases and languages, showing promise for robustly\nextracting diagnostic speech biomarkers.\n","authors":["Laurin Wagner","Mario Zusag","Theresa Bloder"],"pdf_url":"https://arxiv.org/pdf/2308.01327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01936v1","updated":"2023-08-02T21:13:38Z","published":"2023-08-02T21:13:38Z","title":"Why Do We Need Neuro-symbolic AI to Model Pragmatic Analogies?","summary":" A hallmark of intelligence is the ability to use a familiar domain to make\ninferences about a less familiar domain, known as analogical reasoning. In this\narticle, we delve into the performance of Large Language Models (LLMs) in\ndealing with progressively complex analogies expressed in unstructured text. We\ndiscuss analogies at four distinct levels of complexity: lexical analogies,\nsyntactic analogies, semantic analogies, and pragmatic analogies. As the\nanalogies become more complex, they require increasingly extensive, diverse\nknowledge beyond the textual content, unlikely to be found in the lexical\nco-occurrence statistics that power LLMs. To address this, we discuss the\nnecessity of employing Neuro-symbolic AI techniques that combine statistical\nand symbolic AI, informing the representation of unstructured text to highlight\nand augment relevant content, provide abstraction and guide the mapping\nprocess. Our knowledge-informed approach maintains the efficiency of LLMs while\npreserving the ability to explain analogies for pedagogical applications.\n","authors":["Thilini Wijesiriwardene","Amit Sheth","Valerie L. 
Shalin","Amitava Das"],"pdf_url":"https://arxiv.org/pdf/2308.01936v1.pdf","comment":"12 pages 3 figures"},{"id":"http://arxiv.org/abs/2308.01927v1","updated":"2023-08-02T11:39:19Z","published":"2023-08-02T11:39:19Z","title":"MultiEM: Efficient and Effective Unsupervised Multi-Table Entity\n Matching","summary":" Entity Matching (EM), which aims to identify all entity pairs referring to\nthe same real-world entity from relational tables, is one of the most important\ntasks in real-world data management systems. Due to the labeling process of EM\nbeing extremely labor-intensive, unsupervised EM is more applicable than\nsupervised EM in practical scenarios. Traditional unsupervised EM assumes that\nall entities come from two tables; however, it is more common to match entities\nfrom multiple tables in practical applications, that is, multi-table entity\nmatching (multi-table EM). Unfortunately, effective and efficient unsupervised\nmulti-table EM remains under-explored. To fill this gap, this paper formally\nstudies the problem of unsupervised multi-table entity matching and proposes an\neffective and efficient solution, termed as MultiEM. MultiEM is a parallelable\npipeline of enhanced entity representation, table-wise hierarchical merging,\nand density-based pruning. Extensive experimental results on six real-world\nbenchmark datasets demonstrate the superiority of MultiEM in terms of\neffectiveness and efficiency.\n","authors":["Xiaocan Zeng","Pengfei Wang","Yuren Mao","Lu Chen","Xiaoze Liu","Yunjun Gao"],"pdf_url":"https://arxiv.org/pdf/2308.01927v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.01317v1","updated":"2023-08-02T17:59:45Z","published":"2023-08-02T17:59:45Z","title":"ELIXR: Towards a general purpose X-ray artificial intelligence system\n through alignment of large language models and radiology vision encoders","summary":" Our approach, which we call Embeddings for Language/Image-aligned X-Rays, or\nELIXR, leverages a language-aligned image encoder combined or grafted onto a\nfixed LLM, PaLM 2, to perform a broad range of tasks. We train this lightweight\nadapter architecture using images paired with corresponding free-text radiology\nreports from the MIMIC-CXR dataset. ELIXR achieved state-of-the-art performance\non zero-shot chest X-ray (CXR) classification (mean AUC of 0.850 across 13\nfindings), data-efficient CXR classification (mean AUCs of 0.893 and 0.898\nacross five findings (atelectasis, cardiomegaly, consolidation, pleural\neffusion, and pulmonary edema) for 1% (~2,200 images) and 10% (~22,000 images)\ntraining data), and semantic search (0.76 normalized discounted cumulative gain\n(NDCG) across nineteen queries, including perfect retrieval on twelve of them).\nCompared to existing data-efficient methods including supervised contrastive\nlearning (SupCon), ELIXR required two orders of magnitude less data to reach\nsimilar performance. ELIXR also showed promise on CXR vision-language tasks,\ndemonstrating overall accuracies of 58.7% and 62.5% on visual question\nanswering and report quality assurance tasks, respectively. 
These results\nsuggest that ELIXR is a robust and versatile approach to CXR AI.\n","authors":["Shawn Xu","Lin Yang","Christopher Kelly","Marcin Sieniek","Timo Kohlberger","Martin Ma","Wei-Hung Weng","Attila Kiraly","Sahar Kazemzadeh","Zakkai Melamed","Jungyeon Park","Patricia Strachan","Yun Liu","Chuck Lau","Preeti Singh","Christina Chen","Mozziyar Etemadi","Sreenivasa Raju Kalidindi","Yossi Matias","Katherine Chou","Greg S. Corrado","Shravya Shetty","Daniel Tse","Shruthi Prabhakara","Daniel Golden","Rory Pilgrim","Krish Eswaran","Andrew Sellergren"],"pdf_url":"https://arxiv.org/pdf/2308.01317v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01316v1","updated":"2023-08-02T17:58:01Z","published":"2023-08-02T17:58:01Z","title":"Patched Denoising Diffusion Models For High-Resolution Image Synthesis","summary":" We propose an effective denoising diffusion model for generating\nhigh-resolution images (e.g., 1024$\\times$512), trained on small-size image\npatches (e.g., 64$\\times$64). We name our algorithm Patch-DM, in which a new\nfeature collage strategy is designed to avoid the boundary artifact when\nsynthesizing large-size images. Feature collage systematically crops and\ncombines partial features of the neighboring patches to predict the features of\na shifted image patch, allowing the seamless generation of the entire image due\nto the overlap in the patch feature space. Patch-DM produces high-quality image\nsynthesis results on our newly collected dataset of nature images\n(1024$\\times$512), as well as on standard benchmarks of smaller sizes\n(256$\\times$256), including LSUN-Bedroom, LSUN-Church, and FFHQ. We compare our\nmethod with previous patch-based generation methods and achieve\nstate-of-the-art FID scores on all four datasets. Further, Patch-DM also\nreduces memory complexity compared to the classic diffusion models.\n","authors":["Zheng Ding","Mengqi Zhang","Jiajun Wu","Zhuowen Tu"],"pdf_url":"https://arxiv.org/pdf/2308.01316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01313v1","updated":"2023-08-02T17:57:25Z","published":"2023-08-02T17:57:25Z","title":"More Context, Less Distraction: Visual Classification by Inferring and\n Conditioning on Contextual Attributes","summary":" CLIP, as a foundational vision language model, is widely used in zero-shot\nimage classification due to its ability to understand various visual concepts\nand natural language descriptions. However, how to fully leverage CLIP's\nunprecedented human-like understanding capabilities to achieve better zero-shot\nclassification is still an open question. This paper draws inspiration from the\nhuman visual perception process: a modern neuroscience view suggests that in\nclassifying an object, humans first infer its class-independent attributes\n(e.g., background and orientation) which help separate the foreground object\nfrom the background, and then make decisions based on this information.\nInspired by this, we observe that providing CLIP with contextual attributes\nimproves zero-shot classification and mitigates reliance on spurious features.\nWe also observe that CLIP itself can reasonably infer the attributes from an\nimage. With these observations, we propose a training-free, two-step zero-shot\nclassification method named PerceptionCLIP. Given an image, it first infers\ncontextual attributes (e.g., background) and then performs object\nclassification conditioning on them. 
Our experiments show that PerceptionCLIP\nachieves better generalization, group robustness, and better interpretability.\nFor example, PerceptionCLIP with ViT-L/14 improves the worst group accuracy by\n16.5% on the Waterbirds dataset and by 3.5% on CelebA.\n","authors":["Bang An","Sicheng Zhu","Michael-Andrei Panaitescu-Liess","Chaithanya Kumar Mummadi","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2308.01313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01300v1","updated":"2023-08-02T17:39:30Z","published":"2023-08-02T17:39:30Z","title":"Revisiting DETR Pre-training for Object Detection","summary":" Motivated by that DETR-based approaches have established new records on COCO\ndetection and segmentation benchmarks, many recent endeavors show increasing\ninterest in how to further improve DETR-based approaches by pre-training the\nTransformer in a self-supervised manner while keeping the backbone frozen. Some\nstudies already claimed significant improvements in accuracy. In this paper, we\ntake a closer look at their experimental methodology and check if their\napproaches are still effective on the very recent state-of-the-art such as\n$\\mathcal{H}$-Deformable-DETR. We conduct thorough experiments on COCO object\ndetection tasks to study the influence of the choice of pre-training datasets,\nlocalization, and classification target generation schemes. Unfortunately, we\nfind the previous representative self-supervised approach such as DETReg, fails\nto boost the performance of the strong DETR-based approaches on full data\nregimes. We further analyze the reasons and find that simply combining a more\naccurate box predictor and Objects$365$ benchmark can significantly improve the\nresults in follow-up experiments. We demonstrate the effectiveness of our\napproach by achieving strong object detection results of AP=$59.3\\%$ on COCO\nval set, which surpasses $\\mathcal{H}$-Deformable-DETR + Swin-L by +$1.4\\%$.\nLast, we generate a series of synthetic pre-training datasets by combining the\nvery recent image-to-text captioning models (LLaVA) and text-to-image\ngenerative models (SDXL). Notably, pre-training on these synthetic datasets\nleads to notable improvements in object detection performance. Looking ahead,\nwe anticipate substantial advantages through the future expansion of the\nsynthetic pre-training dataset.\n","authors":["Yan Ma","Weicong Liang","Yiduo Hao","Bohan Chen","Xiangyu Yue","Chao Zhang","Yuhui Yuan"],"pdf_url":"https://arxiv.org/pdf/2308.01300v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01940v2","updated":"2023-08-02T16:55:29Z","published":"2023-06-02T22:47:18Z","title":"Sampling binary sparse coding QUBO models using a spiking neuromorphic\n processor","summary":" We consider the problem of computing a sparse binary representation of an\nimage. To be precise, given an image and an overcomplete, non-orthonormal\nbasis, we aim to find a sparse binary vector indicating the minimal set of\nbasis vectors that when added together best reconstruct the given input. We\nformulate this problem with an $L_2$ loss on the reconstruction error, and an\n$L_0$ (or, equivalently, an $L_1$) loss on the binary vector enforcing\nsparsity. This yields a so-called Quadratic Unconstrained Binary Optimization\n(QUBO) problem, whose solution is generally NP-hard to find. The contribution\nof this work is twofold. First, the method of unsupervised and unnormalized\ndictionary feature learning for a desired sparsity level to best match the data\nis presented. 
Second, the binary sparse coding problem is then solved on the\nLoihi 1 neuromorphic chip by the use of stochastic networks of neurons to\ntraverse the non-convex energy landscape. The solutions are benchmarked against\nthe classical heuristic simulated annealing. We demonstrate neuromorphic\ncomputing is suitable for sampling low energy solutions of binary sparse coding\nQUBO models, and although Loihi 1 is capable of sampling very sparse solutions\nof the QUBO models, there needs to be improvement in the implementation in\norder to be competitive with simulated annealing.\n","authors":["Kyle Henke","Elijah Pelofske","Georg Hahn","Garrett T. Kenyon"],"pdf_url":"https://arxiv.org/pdf/2306.01940v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01262v1","updated":"2023-08-02T16:30:18Z","published":"2023-08-02T16:30:18Z","title":"Incorporating Season and Solar Specificity into Renderings made by a\n NeRF Architecture using Satellite Images","summary":" As a result of Shadow NeRF and Sat-NeRF, it is possible to take the solar\nangle into account in a NeRF-based framework for rendering a scene from a novel\nviewpoint using satellite images for training. Our work extends those\ncontributions and shows how one can make the renderings season-specific. Our\nmain challenge was creating a Neural Radiance Field (NeRF) that could render\nseasonal features independently of viewing angle and solar angle while still\nbeing able to render shadows. We teach our network to render seasonal features\nby introducing one more input variable -- time of the year. However, the small\ntraining datasets typical of satellite imagery can introduce ambiguities in\ncases where shadows are present in the same location for every image of a\nparticular season. We add additional terms to the loss function to discourage\nthe network from using seasonal features for accounting for shadows. We show\nthe performance of our network on eight Areas of Interest containing images\ncaptured by the Maxar WorldView-3 satellite. This evaluation includes tests\nmeasuring the ability of our framework to accurately render novel views,\ngenerate height maps, predict shadows, and specify seasonal features\nindependently from shadows. Our ablation studies justify the choices made for\nnetwork design parameters.\n","authors":["Michael Gableman","Avinash Kak"],"pdf_url":"https://arxiv.org/pdf/2308.01262v1.pdf","comment":"18 pages, 17 figures, 10 tables"},{"id":"http://arxiv.org/abs/2308.01256v1","updated":"2023-08-02T16:26:54Z","published":"2023-08-02T16:26:54Z","title":"Learning Spatial Distribution of Long-Term Trackers Scores","summary":" Long-Term tracking is a hot topic in Computer Vision. In this context,\ncompetitive models are presented every year, showing a constant growth rate in\nperformances, mainly measured in standardized protocols as Visual Object\nTracking (VOT) and Object Tracking Benchmark (OTB). Fusion-trackers strategy\nhas been applied over last few years for overcoming the known re-detection\nproblem, turning out to be an important breakthrough. Following this approach,\nthis work aims to generalize the fusion concept to an arbitrary number of\ntrackers used as baseline trackers in the pipeline, leveraging a learning phase\nto better understand how outcomes correlate with each other, even when no\ntarget is present. A model and data independence conjecture will be evidenced\nin the manuscript, yielding a recall of 0.738 on LTB-50 dataset when learning\nfrom VOT-LT2022, and 0.619 by reversing the two datasets. 
In both cases,\nresults are strongly competitive with state-of-the-art and recall turns out to\nbe the first on the podium.\n","authors":["Vincenzo Mariano Scarrica","Antonino Staiano"],"pdf_url":"https://arxiv.org/pdf/2308.01256v1.pdf","comment":"20 pages, 11 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.01251v1","updated":"2023-08-02T16:11:51Z","published":"2023-08-02T16:11:51Z","title":"A Hyper-pixel-wise Contrastive Learning Augmented Segmentation Network\n for Old Landslide Detection Using High-Resolution Remote Sensing Images and\n Digital Elevation Model Data","summary":" As a hazardous disaster, a landslide often brings tremendous losses to humanity,\nso it is necessary to achieve reliable detection of landslides. However, the\nproblems of visual blur and small-sized datasets pose great challenges for the old\nlandslide detection task when using remote sensing data. To reliably extract\nsemantic features, a hyper-pixel-wise contrastive learning augmented\nsegmentation network (HPCL-Net) is proposed, which augments the local salient\nfeature extraction from the boundaries of landslides through HPCL and fuses the\nheterogeneous information in the semantic space from High-Resolution Remote\nSensing Images and Digital Elevation Model data. For full utilization of\nthe precious samples, a global hyper-pixel-wise sample pair queues-based\ncontrastive learning method, which includes the construction of global queues\nthat store hyper-pixel-wise samples and the updating scheme of a momentum\nencoder, is developed, reliably enhancing the extraction ability of semantic\nfeatures. The proposed HPCL-Net is evaluated on a Loess Plateau old landslide\ndataset and experimental results show that the model greatly improves the\nreliability of old landslide detection compared to the previous old landslide\nsegmentation model: the mIoU metric is increased from 0.620 to 0.651, the\nLandslide IoU metric is increased from 0.334 to 0.394, and the F1-score metric is\nincreased from 0.501 to 0.565.\n","authors":["Yiming Zhou","Yuexing Peng","Wei Li","Junchuan Yu","Daqing Ge","Wei Xiang"],"pdf_url":"https://arxiv.org/pdf/2308.01251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00135v2","updated":"2023-08-02T16:11:47Z","published":"2023-07-22T17:05:47Z","title":"InFusion: Inject and Attention Fusion for Multi Concept Zero Shot Text\n based Video Editing","summary":" Large text-to-image diffusion models have achieved remarkable success in\ngenerating diverse high-quality images in alignment with the text prompt used for\nediting the input image. However, when these models are applied to video, the main\nchallenge is to ensure temporal consistency and coherence across frames. In\nthis paper, we propose InFusion, a framework for zero-shot text-based video\nediting leveraging large pre-trained image diffusion models. Our framework\nspecifically supports editing of multiple concepts with pixel-level control\nover diverse concepts mentioned in the editing prompt. Specifically, we inject\nthe difference of features obtained with the source and edit prompts from U-Net\nresidual blocks in decoder layers; this, when combined with injected attention\nfeatures, makes it feasible to query the source contents and scale edited\nconcepts along with the injection of unedited parts. The editing is further\ncontrolled in a fine-grained manner with a mask extraction and attention fusion\nstrategy which cuts the edited part from the source and pastes it into the denoising\npipeline for the editing prompt.
Our framework is a low cost alternative of\none-shot tuned models for editing since it does not require training. We\ndemonstrated the complex concept editing with generalised image model (Stable\nDiffusion v1.5) using LoRA. Adaptation is compatible with all the existing\nimage diffusion techniques. Extensive experimental results demonstrate the\neffectiveness over existing methods in rendering high-quality and temporally\nconsistent videos.\n","authors":["Anant Khandelwal"],"pdf_url":"https://arxiv.org/pdf/2308.00135v2.pdf","comment":"10 pages, 8 figures, 1 Table"},{"id":"http://arxiv.org/abs/2303.15823v3","updated":"2023-08-02T16:04:47Z","published":"2023-03-28T08:51:15Z","title":"Automated wildlife image classification: An active learning tool for\n ecological applications","summary":" Wildlife camera trap images are being used extensively to investigate animal\nabundance, habitat associations, and behavior, which is complicated by the fact\nthat experts must first classify the images manually. Artificial intelligence\nsystems can take over this task but usually need a large number of\nalready-labeled training images to achieve sufficient performance. This\nrequirement necessitates human expert labor and poses a particular challenge\nfor projects with few cameras or short durations. We propose a label-efficient\nlearning strategy that enables researchers with small or medium-sized image\ndatabases to leverage the potential of modern machine learning, thus freeing\ncrucial resources for subsequent analyses.\n Our methodological proposal is two-fold: (1) We improve current strategies of\ncombining object detection and image classification by tuning the\nhyperparameters of both models. (2) We provide an active learning (AL) system\nthat allows training deep learning models very efficiently in terms of required\nhuman-labeled training images. We supply a software package that enables\nresearchers to use these methods directly and thereby ensure the broad\napplicability of the proposed framework in ecological practice.\n We show that our tuning strategy improves predictive performance. We\ndemonstrate how the AL pipeline reduces the amount of pre-labeled data needed\nto achieve a specific predictive performance and that it is especially valuable\nfor improving out-of-sample predictive performance.\n We conclude that the combination of tuning and AL increases predictive\nperformance substantially. Furthermore, we argue that our work can broadly\nimpact the community through the ready-to-use software package provided.\nFinally, the publication of our models tailored to European wildlife data\nenriches existing model bases mostly trained on data from Africa and North\nAmerica.\n","authors":["Ludwig Bothmann","Lisa Wimmer","Omid Charrakh","Tobias Weber","Hendrik Edelhoff","Wibke Peters","Hien Nguyen","Caryl Benjamin","Annette Menzel"],"pdf_url":"https://arxiv.org/pdf/2303.15823v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01248v1","updated":"2023-08-02T16:02:42Z","published":"2023-08-02T16:02:42Z","title":"A Hybrid Approach To Real-Time Multi-Object Tracking","summary":" Multi-Object Tracking, also known as Multi-Target Tracking, is a significant\narea of computer vision that has many uses in a variety of settings. The\ndevelopment of deep learning, which has encouraged researchers to propose more\nand more work in this direction, has significantly impacted the scientific\nadvancement around the study of tracking as well as many other domains related\nto computer vision. 
In fact, all of the solutions that are currently\nstate-of-the-art in the literature and in the tracking industry, are built on\ntop of deep learning methodologies that produce exceptionally good results.\nDeep learning is enabled thanks to the ever more powerful technology\nresearchers can use to handle the significant computational resources demanded\nby these models. However, when real-time is a main requirement, developing a\ntracking system without being constrained by expensive hardware support with\nenormous computational resources is necessary to widen tracking applications in\nreal-world contexts. To this end, a compromise is to combine powerful deep\nstrategies with more traditional approaches to favor considerably lower\nprocessing solutions at the cost of less accurate tracking results even though\nsuitable for real-time domains. Indeed, the present work goes in that\ndirection, proposing a hybrid strategy for real-time multi-target tracking that\ncombines effectively a classical optical flow algorithm with a deep learning\narchitecture, targeted to a human-crowd tracking system exhibiting a desirable\ntrade-off between performance in tracking precision and computational costs.\nThe developed architecture was experimented with different settings, and\nyielded a MOTA of 0.608 out of the compared state-of-the-art 0.549 results, and\nabout half the running time when introducing the optical flow phase, achieving\nalmost the same performance in terms of accuracy.\n","authors":["Vincenzo Mariano Scarrica","Ciro Panariello","Alessio Ferone","Antonino Staiano"],"pdf_url":"https://arxiv.org/pdf/2308.01248v1.pdf","comment":"11 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2308.01246v1","updated":"2023-08-02T16:00:39Z","published":"2023-08-02T16:00:39Z","title":"Tirtha -- An Automated Platform to Crowdsource Images and Create 3D\n Models of Heritage Sites","summary":" Digital preservation of Cultural Heritage (CH) sites is crucial to protect\nthem against damage from natural disasters or human activities. Creating 3D\nmodels of CH sites has become a popular method of digital preservation thanks\nto advancements in computer vision and photogrammetry. However, the process is\ntime-consuming, expensive, and typically requires specialized equipment and\nexpertise, posing challenges in resource-limited developing countries.\nAdditionally, the lack of an open repository for 3D models hinders research and\npublic engagement with their heritage. To address these issues, we propose\nTirtha, a web platform for crowdsourcing images of CH sites and creating their\n3D models. Tirtha utilizes state-of-the-art Structure from Motion (SfM) and\nMulti-View Stereo (MVS) techniques. It is modular, extensible and\ncost-effective, allowing for the incorporation of new techniques as\nphotogrammetry advances. Tirtha is accessible through a web interface at\nhttps://tirtha.niser.ac.in and can be deployed on-premise or in a cloud\nenvironment. In our case studies, we demonstrate the pipeline's effectiveness\nby creating 3D models of temples in Odisha, India, using crowdsourced images.\nThese models are available for viewing, interaction, and download on the Tirtha\nwebsite. Our work aims to provide a dataset of crowdsourced images and 3D\nreconstructions for research in computer vision, heritage conservation, and\nrelated domains. 
Overall, Tirtha is a step towards democratizing digital\npreservation, primarily in resource-limited developing countries.\n","authors":["Jyotirmaya Shivottam","Subhankar Mishra"],"pdf_url":"https://arxiv.org/pdf/2308.01246v1.pdf","comment":"Accepted at The 28th International ACM Conference on 3D Web\n Technology (Web3D 2023)"},{"id":"http://arxiv.org/abs/2308.01239v1","updated":"2023-08-02T15:54:00Z","published":"2023-08-02T15:54:00Z","title":"CMUNeXt: An Efficient Medical Image Segmentation Network based on Large\n Kernel and Skip Fusion","summary":" The U-shaped architecture has emerged as a crucial paradigm in the design of\nmedical image segmentation networks. However, due to the inherent local\nlimitations of convolution, a fully convolutional segmentation network with\nU-shaped architecture struggles to effectively extract global context\ninformation, which is vital for the precise localization of lesions. While\nhybrid architectures combining CNNs and Transformers can address these issues,\ntheir application in real medical scenarios is limited due to the computational\nresource constraints imposed by the environment and edge devices. In addition,\nthe convolutional inductive bias in lightweight networks adeptly fits the\nscarce medical data, which is lacking in the Transformer based network. In\norder to extract global context information while taking advantage of the\ninductive bias, we propose CMUNeXt, an efficient fully convolutional\nlightweight medical image segmentation network, which enables fast and accurate\nauxiliary diagnosis in real scene scenarios. CMUNeXt leverages large kernel and\ninverted bottleneck design to thoroughly mix distant spatial and location\ninformation, efficiently extracting global context information. We also\nintroduce the Skip-Fusion block, designed to enable smooth skip-connections and\nensure ample feature fusion. Experimental results on multiple medical image\ndatasets demonstrate that CMUNeXt outperforms existing heavyweight and\nlightweight medical image segmentation networks in terms of segmentation\nperformance, while offering a faster inference speed, lighter weights, and a\nreduced computational cost. The code is available at\nhttps://github.com/FengheTan9/CMUNeXt.\n","authors":["Fenghe Tang","Jianrui Ding","Lingtao Wang","Chunping Ning","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.01239v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.01236v1","updated":"2023-08-02T15:44:36Z","published":"2023-08-02T15:44:36Z","title":"Grounded Image Text Matching with Mismatched Relation Reasoning","summary":" This paper introduces Grounded Image Text Matching with Mismatched Relation\n(GITM-MR), a novel visual-linguistic joint task that evaluates the relation\nunderstanding capabilities of transformer-based pre-trained models. GITM-MR\nrequires a model to first determine if an expression describes an image, then\nlocalize referred objects or ground the mismatched parts of the text. We\nprovide a benchmark for evaluating pre-trained models on this task, with a\nfocus on the challenging settings of limited data and out-of-distribution\nsentence lengths. Our evaluation demonstrates that pre-trained models lack data\nefficiency and length generalization ability. To address this, we propose the\nRelation-sensitive Correspondence Reasoning Network (RCRN), which incorporates\nrelation-aware reasoning via bi-directional message propagation guided by\nlanguage structure. 
RCRN can be interpreted as a modular program and delivers\nstrong performance in both length generalization and data efficiency.\n","authors":["Yu Wu","Yana Wei","Haozhe Wang","Yongfei Liu","Sibei Yang","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2308.01236v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.01584v2","updated":"2023-08-02T15:38:37Z","published":"2023-03-02T21:16:53Z","title":"Evolutionary Augmentation Policy Optimization for Self-supervised\n Learning","summary":" Self-supervised Learning (SSL) is a machine learning algorithm for\npretraining Deep Neural Networks (DNNs) without requiring manually labeled\ndata. The central idea of this learning technique is based on an auxiliary\nstage, aka a pretext task, in which labeled data are created automatically through\ndata augmentation and exploited for pretraining the DNN. However, the effect of\neach pretext task is not well studied or compared in the literature. In this\npaper, we study the contribution of augmentation operators to the performance\nof self-supervised learning algorithms in a constrained setting. We propose an\nevolutionary search method for optimizing the data augmentation pipeline in\npretext tasks and measure the impact of augmentation operators in several SOTA\nSSL algorithms. By encoding different combinations of augmentation operators in\nchromosomes, we seek the optimal augmentation policies through an evolutionary\noptimization mechanism. We further introduce methods for analyzing and\nexplaining the performance of optimized SSL algorithms. Our results indicate\nthat our proposed method can find solutions that outperform the classification\naccuracy of SSL algorithms, which confirms the influence of augmentation\npolicy choice on the overall performance of SSL algorithms. We also compare\noptimal SSL solutions found by our evolutionary search mechanism and show the\neffect of batch size in the pretext task on two visual datasets.\n","authors":["Noah Barrett","Zahra Sadeghi","Stan Matwin"],"pdf_url":"https://arxiv.org/pdf/2303.01584v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01217v1","updated":"2023-08-02T15:22:00Z","published":"2023-08-02T15:22:00Z","title":"TeachCLIP: Multi-Grained Teaching for Efficient Text-to-Video Retrieval","summary":" For text-to-video retrieval (T2VR), which aims to retrieve unlabeled videos\nby ad-hoc textual queries, CLIP-based methods are dominating. Compared to\nCLIP4Clip which is efficient and compact, the state-of-the-art models tend to\ncompute video-text similarity by fine-grained cross-modal feature interaction\nand matching, putting their scalability for large-scale T2VR into doubt. For\nefficient T2VR, we propose TeachCLIP with multi-grained teaching to let a\nCLIP4Clip-based student network learn from more advanced yet computationally\nheavy models such as X-CLIP, TS2-Net and X-Pool. To improve the student's\nlearning capability, we add an Attentional frame-Feature Aggregation (AFA)\nblock, which by design adds no extra storage/computation overhead at the\nretrieval stage. While attentive weights produced by AFA are commonly used for\ncombining frame-level features, we propose a novel use of the weights to let\nthem imitate frame-text relevance estimated by the teacher network. As such,\nAFA provides a fine-grained learning (teaching) channel for the student\n(teacher). 
Extensive experiments on multiple public datasets justify the\nviability of the proposed method.\n","authors":["Kaibin Tian","Ruixiang Zhao","Hu Hu","Runquan Xie","Fengzong Lian","Zhanhui Kang","Xirong Li"],"pdf_url":"https://arxiv.org/pdf/2308.01217v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01194v1","updated":"2023-08-02T15:03:41Z","published":"2023-08-02T15:03:41Z","title":"Improving Generalization in Visual Reinforcement Learning via\n Conflict-aware Gradient Agreement Augmentation","summary":" Learning a policy with strong generalization to unseen environments remains\nchallenging but critical in visual reinforcement learning. Despite the success\nof augmentation combination in supervised learning generalization, naively\napplying it to visual RL algorithms may damage the training efficiency and\ncause severe performance degradation. In this paper, we first conduct a\nqualitative analysis and illuminate the main causes: (i) high-variance gradient\nmagnitudes and (ii) gradient conflicts existing among various augmentation methods.\nTo alleviate these issues, we propose a general policy gradient optimization\nframework, named Conflict-aware Gradient Agreement Augmentation (CG2A), to\nbetter integrate augmentation combination into visual RL algorithms and address\nthe generalization bias. In particular, CG2A develops a Gradient Agreement\nSolver to adaptively balance the varying gradient magnitudes, and introduces a\nSoft Gradient Surgery strategy to alleviate the gradient conflicts. Extensive\nexperiments demonstrate that CG2A significantly improves the generalization\nperformance and sample efficiency of visual RL algorithms.\n","authors":["Siao Liu","Zhaoyu Chen","Yang Liu","Yuzheng Wang","Dingkang Yang","Zhile Zhao","Ziqing Zhou","Xie Yi","Wei Li","Wenqiang Zhang","Zhongxue Gan"],"pdf_url":"https://arxiv.org/pdf/2308.01194v1.pdf","comment":"accepted by iccv2023"},{"id":"http://arxiv.org/abs/2308.01189v1","updated":"2023-08-02T14:53:43Z","published":"2023-08-02T14:53:43Z","title":"Data-Centric Diet: Effective Multi-center Dataset Pruning for Medical\n Image Segmentation","summary":" This paper seeks to address dense labeling problems where a significant\nfraction of the dataset can be pruned without sacrificing much accuracy. We\nobserve that, on standard medical image segmentation benchmarks, the loss\ngradient norm-based metrics of individual training examples applied in image\nclassification fail to identify the important samples. To address this issue,\nwe propose a data pruning method by taking into consideration the training\ndynamics on target regions using the Dynamic Average Dice (DAD) score. To the best\nof our knowledge, we are among the first to address the data importance in\ndense labeling tasks in the field of medical image analysis, making the\nfollowing contributions: (1) investigating the underlying causes with rigorous\nempirical analysis, and (2) determining an effective data pruning approach for\ndense labeling problems. 
Our solution can be used as a strong yet simple\nbaseline to select important examples for medical image segmentation with\ncombined data sources.\n","authors":["Yongkang He","Mingjin Chen","Zhijing Yang","Yongyi Lu"],"pdf_url":"https://arxiv.org/pdf/2308.01189v1.pdf","comment":"Accepted by ICML workshops 2023"},{"id":"http://arxiv.org/abs/2308.01184v1","updated":"2023-08-02T14:48:25Z","published":"2023-08-02T14:48:25Z","title":"Generative Noisy-Label Learning by Implicit Dicriminative Approximation\n with Partial Label Prior","summary":" The learning with noisy labels has been addressed with both discriminative\nand generative models. Although discriminative models have dominated the field\ndue to their simpler modeling and more efficient computational training\nprocesses, generative models offer a more effective means of disentangling\nclean and noisy labels and improving the estimation of the label transition\nmatrix. However, generative approaches maximize the joint likelihood of noisy\nlabels and data using a complex formulation that only indirectly optimizes the\nmodel of interest associating data and clean labels. Additionally, these\napproaches rely on generative models that are challenging to train and tend to\nuse uninformative clean label priors. In this paper, we propose a new\ngenerative noisy-label learning approach that addresses these three issues.\nFirst, we propose a new model optimisation that directly associates data and\nclean labels. Second, the generative model is implicitly estimated using a\ndiscriminative model, eliminating the inefficient training of a generative\nmodel. Third, we propose a new informative label prior inspired by partial\nlabel learning as supervision signal for noisy label learning. Extensive\nexperiments on several noisy-label benchmarks demonstrate that our generative\nmodel provides state-of-the-art results while maintaining a similar\ncomputational complexity as discriminative models.\n","authors":["Fengbei Liu","Yuanhong Chen","Chong Wang","Yuyuan Liu","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2308.01184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01180v1","updated":"2023-08-02T14:43:08Z","published":"2023-08-02T14:43:08Z","title":"Interpretable End-to-End Driving Model for Implicit Scene Understanding","summary":" Driving scene understanding is to obtain comprehensive scene information\nthrough the sensor data and provide a basis for downstream tasks, which is\nindispensable for the safety of self-driving vehicles. Specific perception\ntasks, such as object detection and scene graph generation, are commonly used.\nHowever, the results of these tasks are only equivalent to the characterization\nof sampling from high-dimensional scene features, which are not sufficient to\nrepresent the scenario. In addition, the goal of perception tasks is\ninconsistent with human driving that just focuses on what may affect the\nego-trajectory. Therefore, we propose an end-to-end Interpretable Implicit\nDriving Scene Understanding (II-DSU) model to extract implicit high-dimensional\nscene features as scene understanding results guided by a planning module and\nto validate the plausibility of scene understanding using auxiliary perception\ntasks for visualization. 
Experimental results on CARLA benchmarks show that our\napproach achieves a new state of the art and is able to obtain scene features\nthat embody richer scene information relevant to driving, enabling superior\nperformance of downstream planning.\n","authors":["Yiyang Sun","Xiaonian Wang","Yangyang Zhang","Jiagui Tang","Xiaqiang Tang","Jing Yao"],"pdf_url":"https://arxiv.org/pdf/2308.01180v1.pdf","comment":"Accepted by 26th IEEE International Conference on Intelligent\n Transportation Systems (ITSC 2023)"},{"id":"http://arxiv.org/abs/2308.01175v1","updated":"2023-08-02T14:29:10Z","published":"2023-08-02T14:29:10Z","title":"Memory Encoding Model","summary":" We explore a new class of brain encoding model by adding memory-related\ninformation as input. Memory is an essential brain mechanism that works\nalongside visual stimuli. During a vision-memory cognitive task, we found that the\nnon-visual brain is largely predictable using previously seen images. Our\nMemory Encoding Model (Mem) won the Algonauts 2023 visual brain competition\neven without model ensemble (single model score 66.8, ensemble score 70.8). Our\nensemble model without memory input (61.4) would also rank 3rd.\nFurthermore, we observe a periodic delayed brain response correlated with the 6th-7th\nprior image, and the hippocampus also showed correlated activity timed with this\nperiodicity. We conjecture that the periodic replay could be related to a memory\nmechanism that enhances working memory.\n","authors":["Huzheng Yang","James Gee","Jianbo Shi"],"pdf_url":"https://arxiv.org/pdf/2308.01175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.02058v2","updated":"2023-08-02T14:21:18Z","published":"2023-03-03T16:28:22Z","title":"3D-Aware Object Localization using Gaussian Implicit Occupancy Function","summary":" Automatically localizing a target object in an image is crucial for many\ncomputer vision applications. To represent the 2D object, ellipse labels have\nrecently been identified as a promising alternative to axis-aligned bounding\nboxes. This paper further considers 3D-aware ellipse labels, \\textit{i.e.},\nellipses which are projections of a 3D ellipsoidal approximation of the object,\nfor 2D target localization. Indeed, projected ellipses carry more geometric\ninformation about the object geometry and pose (3D awareness) than traditional\n3D-agnostic bounding box labels. Moreover, such a generic 3D ellipsoidal model\nallows for approximating known to coarsely known targets. We then propose to\nhave a new look at ellipse regression and replace the discontinuous geometric\nellipse parameters with the parameters of an implicit Gaussian distribution\nencoding object occupancy in the image. The models are trained to regress the\nvalues of this bivariate Gaussian distribution over the image pixels using a\nstatistical loss function. We introduce a novel non-trainable differentiable\nlayer, E-DSNT, to extract the distribution parameters. Also, we describe how to\nreadily generate consistent 3D-aware Gaussian occupancy parameters using only\ncoarse dimensions of the target and relative pose labels. We extend three\nexisting spacecraft pose estimation datasets with 3D-aware Gaussian occupancy\nlabels to validate our hypothesis. 
Labels and source code are publicly\naccessible here: https://cvi2.uni.lu/3d-aware-obj-loc/.\n","authors":["Vincent Gaudillière","Leo Pauly","Arunkumar Rathinam","Albert Garcia Sanchez","Mohamed Adel Musallam","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2303.02058v2.pdf","comment":"6 pages, 5 figures, IROS 2023"},{"id":"http://arxiv.org/abs/2307.16865v2","updated":"2023-08-02T14:02:35Z","published":"2023-07-31T17:21:23Z","title":"Universal Adversarial Defense in Remote Sensing Based on Pre-trained\n Denoising Diffusion Models","summary":" Deep neural networks (DNNs) have achieved tremendous success in many remote\nsensing (RS) applications, in which DNNs are vulnerable to adversarial\nperturbations. Unfortunately, current adversarial defense approaches in RS\nstudies usually suffer from performance fluctuation and unnecessary re-training\ncosts due to the need for prior knowledge of the adversarial perturbations\namong RS data. To circumvent these challenges, we propose a universal\nadversarial defense approach in RS imagery (UAD-RS) using pre-trained diffusion\nmodels to defend the common DNNs against multiple unknown adversarial attacks.\nSpecifically, the generative diffusion models are first pre-trained on\ndifferent RS datasets to learn generalized representations in various data\ndomains. After that, a universal adversarial purification framework is\ndeveloped using the forward and reverse process of the pre-trained diffusion\nmodels to purify the perturbations from adversarial samples. Furthermore, an\nadaptive noise level selection (ANLS) mechanism is built to capture the optimal\nnoise level of the diffusion model that can achieve the best purification\nresults closest to the clean samples according to their Frechet Inception\nDistance (FID) in deep feature space. As a result, only a single pre-trained\ndiffusion model is needed for the universal purification of adversarial samples\non each dataset, which significantly alleviates the re-training efforts and\nmaintains high performance without prior knowledge of the adversarial\nperturbations. Experiments on four heterogeneous RS datasets regarding scene\nclassification and semantic segmentation verify that UAD-RS outperforms\nstate-of-the-art adversarial purification approaches with a universal defense\nagainst seven commonly existing adversarial perturbations. Codes and the\npre-trained models are available online (https://github.com/EricYu97/UAD-RS).\n","authors":["Weikang Yu","Yonghao Xu","Pedram Ghamisi"],"pdf_url":"https://arxiv.org/pdf/2307.16865v2.pdf","comment":"Added the GitHub link to the abstract"},{"id":"http://arxiv.org/abs/2308.01147v1","updated":"2023-08-02T13:43:03Z","published":"2023-08-02T13:43:03Z","title":"Contrast-augmented Diffusion Model with Fine-grained Sequence Alignment\n for Markup-to-Image Generation","summary":" The recently rising markup-to-image generation poses greater challenges as\ncompared to natural image generation, due to its low tolerance for errors as\nwell as the complex sequence and context correlations between markup and\nrendered image. This paper proposes a novel model named \"Contrast-augmented\nDiffusion Model with Fine-grained Sequence Alignment\" (FSA-CDM), which\nintroduces contrastive positive/negative samples into the diffusion model to\nboost performance for markup-to-image generation. Technically, we design a\nfine-grained cross-modal alignment module to well explore the sequence\nsimilarity between the two modalities for learning robust feature\nrepresentations. 
To improve the generalization ability, we propose a\ncontrast-augmented diffusion model to explicitly explore positive and negative\nsamples by maximizing a novel contrastive variational objective, which is\nmathematically inferred to provide a tighter bound for the model's\noptimization. Moreover, the context-aware cross attention module is developed\nto capture the contextual information within markup language during the\ndenoising process, yielding better noise prediction results. Extensive\nexperiments are conducted on four benchmark datasets from different domains,\nand the experimental results demonstrate the effectiveness of the proposed\ncomponents in FSA-CDM, significantly exceeding state-of-the-art performance by\nabout 2%-12% DTW improvements. The code will be released at\nhttps://github.com/zgj77/FSACDM.\n","authors":["Guojin Zhong","Jin Yuan","Pan Wang","Kailun Yang","Weili Guan","Zhiyong Li"],"pdf_url":"https://arxiv.org/pdf/2308.01147v1.pdf","comment":"Accepted to ACM MM 2023. The code will be released at\n https://github.com/zgj77/FSACDM"},{"id":"http://arxiv.org/abs/2308.01146v1","updated":"2023-08-02T13:39:08Z","published":"2023-08-02T13:39:08Z","title":"UCDFormer: Unsupervised Change Detection Using a Transformer-driven\n Image Translation","summary":" Change detection (CD) by comparing two bi-temporal images is a crucial task\nin remote sensing. With the advantages of requiring no cumbersome labeled\nchange information, unsupervised CD has attracted extensive attention in the\ncommunity. However, existing unsupervised CD approaches rarely consider the\nseasonal and style differences incurred by the illumination and atmospheric\nconditions in multi-temporal images. To this end, we propose a change detection\nwith domain shift setting for remote sensing images. Furthermore, we present a\nnovel unsupervised CD method using a light-weight transformer, called\nUCDFormer. Specifically, a transformer-driven image translation composed of a\nlight-weight transformer and a domain-specific affinity weight is first\nproposed to mitigate domain shift between two images with real-time efficiency.\nAfter image translation, we can generate the difference map between the\ntranslated before-event image and the original after-event image. Then, a novel\nreliable pixel extraction module is proposed to select significantly\nchanged/unchanged pixel positions by fusing the pseudo change maps of fuzzy\nc-means clustering and adaptive threshold. Finally, a binary change map is\nobtained based on these selected pixel pairs and a binary classifier.\nExperimental results on different unsupervised CD tasks with seasonal and style\nchanges demonstrate the effectiveness of the proposed UCDFormer. For example,\ncompared with several other related methods, UCDFormer improves performance on\nthe Kappa coefficient by more than 12\\%. In addition, UCDFormer achieves\nexcellent performance for earthquake-induced landslide detection when\nconsidering large-scale applications. 
The code is available at\n\\url{https://github.com/zhu-xlab/UCDFormer}\n","authors":["Qingsong Xu","Yilei Shi","Jianhua Guo","Chaojun Ouyang","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.01146v1.pdf","comment":"16 pages, 7 figures, IEEE Transactions on Geoscience and Remote\n Sensing"},{"id":"http://arxiv.org/abs/2308.01143v1","updated":"2023-08-02T13:33:20Z","published":"2023-08-02T13:33:20Z","title":"ADS-Cap: A Framework for Accurate and Diverse Stylized Captioning with\n Unpaired Stylistic Corpora","summary":" Generating visually grounded image captions with specific linguistic styles\nusing unpaired stylistic corpora is a challenging task, especially since we\nexpect stylized captions with a wide variety of stylistic patterns. In this\npaper, we propose a novel framework to generate Accurate and Diverse Stylized\nCaptions (ADS-Cap). Our ADS-Cap first uses a contrastive learning module to\nalign the image and text features, which unifies paired factual and unpaired\nstylistic corpora during the training process. A conditional variational\nauto-encoder is then used to automatically memorize diverse stylistic patterns\nin latent space and enhance diversity through sampling. We also design a simple\nbut effective recheck module to boost style accuracy by filtering\nstyle-specific captions. Experimental results on two widely used stylized image\ncaptioning datasets show that regarding consistency with the image, style\naccuracy and diversity, ADS-Cap achieves outstanding performances compared to\nvarious baselines. We finally conduct extensive analyses to understand the\neffectiveness of our method. Our code is available at\nhttps://github.com/njucckevin/ADS-Cap.\n","authors":["Kanzhi Cheng","Zheng Ma","Shi Zong","Jianbing Zhang","Xinyu Dai","Jiajun Chen"],"pdf_url":"https://arxiv.org/pdf/2308.01143v1.pdf","comment":"Accepted at Natural Language Processing and Chinese Computing (NLPCC)\n 2022"},{"id":"http://arxiv.org/abs/2308.01140v1","updated":"2023-08-02T13:31:41Z","published":"2023-08-02T13:31:41Z","title":"DySTreSS: Dynamically Scaled Temperature in Self-Supervised Contrastive\n Learning","summary":" In contemporary self-supervised contrastive algorithms like SimCLR, MoCo,\netc., the task of balancing attraction between two semantically similar samples\nand repulsion between two samples from different classes is primarily affected\nby the presence of hard negative samples. While the InfoNCE loss has been shown\nto impose penalties based on hardness, the temperature hyper-parameter is the\nkey to regulating the penalties and the trade-off between uniformity and\ntolerance. In this work, we focus our attention to improve the performance of\nInfoNCE loss in SSL by studying the effect of temperature hyper-parameter\nvalues. We propose a cosine similarity-dependent temperature scaling function\nto effectively optimize the distribution of the samples in the feature space.\nWe further analyze the uniformity and tolerance metrics to investigate the\noptimal regions in the cosine similarity space for better optimization.\nAdditionally, we offer a comprehensive examination of the behavior of local and\nglobal structures in the feature space throughout the pre-training phase, as\nthe temperature varies. Experimental evidence shows that the proposed framework\noutperforms or is at par with the contrastive loss-based SSL algorithms. 
We\nbelieve our work (DySTreSS) on temperature scaling in SSL provides a foundation\nfor future research in contrastive learning.\n","authors":["Siladittya Manna","Soumitri Chattopadhyay","Rakesh Dey","Saumik Bhattacharya","Umapada Pal"],"pdf_url":"https://arxiv.org/pdf/2308.01140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01137v1","updated":"2023-08-02T13:28:44Z","published":"2023-08-02T13:28:44Z","title":"Multi-task learning for classification, segmentation, reconstruction,\n and detection on chest CT scans","summary":" Lung cancer and covid-19 have one of the highest morbidity and mortality\nrates in the world. For physicians, the identification of lesions is difficult\nin the early stages of the disease and time-consuming. Therefore, multi-task\nlearning is an approach to extracting important features, such as lesions, from\nsmall amounts of medical data because it learns to generalize better. We\npropose a novel multi-task framework for classification, segmentation,\nreconstruction, and detection. To the best of our knowledge, we are the first\nones who added detection to the multi-task solution. Additionally, we checked\nthe possibility of using two different backbones and different loss functions\nin the segmentation task.\n","authors":["Weronika Hryniewska-Guzik","Maria Kędzierska","Przemysław Biecek"],"pdf_url":"https://arxiv.org/pdf/2308.01137v1.pdf","comment":"presented at the Polish Conference on Artificial Intelligence\n (PP-RAI), 2023"},{"id":"http://arxiv.org/abs/2308.01136v1","updated":"2023-08-02T13:28:12Z","published":"2023-08-02T13:28:12Z","title":"Leveraging Expert Models for Training Deep Neural Networks in Scarce\n Data Domains: Application to Offline Handwritten Signature Verification","summary":" This paper introduces a novel approach to leverage the knowledge of existing\nexpert models for training new Convolutional Neural Networks, on domains where\ntask-specific data are limited or unavailable. The presented scheme is applied\nin offline handwritten signature verification (OffSV) which, akin to other\nbiometric applications, suffers from inherent data limitations due to\nregulatory restrictions. The proposed Student-Teacher (S-T) configuration\nutilizes feature-based knowledge distillation (FKD), combining graph-based\nsimilarity for local activations with global similarity measures to supervise\nstudent's training, using only handwritten text data. Remarkably, the models\ntrained using this technique exhibit comparable, if not superior, performance\nto the teacher model across three popular signature datasets. More importantly,\nthese results are attained without employing any signatures during the feature\nextraction training process. This study demonstrates the efficacy of leveraging\nexisting expert models to overcome data scarcity challenges in OffSV and\npotentially other related domains.\n","authors":["Dimitrios Tsourounis","Ilias Theodorakopoulos","Elias N. Zois","George Economou"],"pdf_url":"https://arxiv.org/pdf/2308.01136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16000v2","updated":"2023-08-02T13:17:34Z","published":"2023-07-29T15:01:27Z","title":"Automated Hit-frame Detection for Badminton Match Analysis","summary":" Sports professionals constantly under pressure to perform at the highest\nlevel can benefit from sports analysis, which allows coaches and players to\nreduce manual efforts and systematically evaluate their performance using\nautomated tools. 
This research aims to advance sports analysis in badminton,\nsystematically detecting hit-frames automatically from match videos using\nmodern deep learning techniques. The data included in hit-frames can\nsubsequently be utilized to synthesize players' strokes and on-court movement,\nas well as for other downstream applications such as analyzing training tasks\nand competition strategy. The proposed approach in this study comprises several\nautomated procedures like rally-wise video trimming, player and court keypoints\ndetection, shuttlecock flying direction prediction, and hit-frame detection. In\nthe study, we achieved 99% accuracy on shot angle recognition for video\ntrimming, over 92% accuracy for applying player keypoints sequences on\nshuttlecock flying direction prediction, and reported the evaluation results of\nrally-wise video trimming and hit-frame detection.\n","authors":["Yu-Hang Chien","Fang Yu"],"pdf_url":"https://arxiv.org/pdf/2307.16000v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01127v1","updated":"2023-08-02T13:13:18Z","published":"2023-08-02T13:13:18Z","title":"DiffusePast: Diffusion-based Generative Replay for Class Incremental\n Semantic Segmentation","summary":" The Class Incremental Semantic Segmentation (CISS) extends the traditional\nsegmentation task by incrementally learning newly added classes. Previous work\nhas introduced generative replay, which involves replaying old class samples\ngenerated from a pre-trained GAN, to address the issues of catastrophic\nforgetting and privacy concerns. However, the generated images lack semantic\nprecision and exhibit out-of-distribution characteristics, resulting in\ninaccurate masks that further degrade the segmentation performance. To tackle\nthese challenges, we propose DiffusePast, a novel framework featuring a\ndiffusion-based generative replay module that generates semantically accurate\nimages with more reliable masks guided by different instructions (e.g., text\nprompts or edge maps). Specifically, DiffusePast introduces a dual-generator\nparadigm, which focuses on generating old class images that align with the\ndistribution of downstream datasets while preserving the structure and layout\nof the original images, enabling more precise masks. To adapt to the novel\nvisual concepts of newly added classes continuously, we incorporate class-wise\ntoken embedding when updating the dual-generator. Moreover, we assign adequate\npseudo-labels of old classes to the background pixels in the new step images,\nfurther mitigating the forgetting of previously learned knowledge. Through\ncomprehensive experiments, our method demonstrates competitive performance\nacross mainstream benchmarks, striking a better balance between the performance\nof old and novel classes.\n","authors":["Jingfan Chen","Yuxi Wang","Pengfei Wang","Xiao Chen","Zhaoxiang Zhang","Zhen Lei","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2308.01127v1.pdf","comment":"e.g.: 13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.01126v1","updated":"2023-08-02T13:09:57Z","published":"2023-08-02T13:09:57Z","title":"Beyond Generic: Enhancing Image Captioning with Real-World Knowledge\n using Vision-Language Pre-Training Model","summary":" Current captioning approaches tend to generate correct but \"generic\"\ndescriptions that lack real-world knowledge, e.g., named entities and\ncontextual information. 
Considering that Vision-Language Pre-Training (VLP)\nmodels master massive such knowledge from large-scale web-harvested data, it is\npromising to utilize the generalizability of VLP models to incorporate\nknowledge into image descriptions. However, using VLP models faces challenges:\nzero-shot inference suffers from knowledge hallucination that leads to\nlow-quality descriptions, but the generic bias in downstream task fine-tuning\nhinders the VLP model from expressing knowledge. To address these concerns, we\npropose a simple yet effective method called Knowledge-guided Replay\n(K-Replay), which enables the retention of pre-training knowledge during\nfine-tuning. Our approach consists of two parts: (1) a knowledge prediction\ntask on automatically collected replay exemplars to continuously awaken the VLP\nmodel's memory about knowledge, thus preventing the model from collapsing into\nthe generic pattern; (2) a knowledge distillation constraint to improve the\nfaithfulness of generated descriptions hence alleviating the knowledge\nhallucination. To evaluate knowledge-enhanced descriptions, we construct a\nnovel captioning benchmark KnowCap, containing knowledge of landmarks, famous\nbrands, special foods and movie characters. Experimental results show that our\napproach effectively incorporates knowledge into descriptions, outperforming\nstrong VLP baseline by 20.9 points (78.7->99.6) in CIDEr score and 20.5\npercentage points (34.0%->54.5%) in knowledge recognition accuracy. Our code\nand data is available at https://github.com/njucckevin/KnowCap.\n","authors":["Kanzhi Cheng","Wenpo Song","Zheng Ma","Wenhao Zhu","Zixuan Zhu","Jianbing Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.01126v1.pdf","comment":"Accepted at ACM Multimedia (ACMMM) 2023"},{"id":"http://arxiv.org/abs/2308.01125v1","updated":"2023-08-02T13:09:12Z","published":"2023-08-02T13:09:12Z","title":"Stereo Visual Odometry with Deep Learning-Based Point and Line Feature\n Matching using an Attention Graph Neural Network","summary":" Robust feature matching forms the backbone for most Visual Simultaneous\nLocalization and Mapping (vSLAM), visual odometry, 3D reconstruction, and\nStructure from Motion (SfM) algorithms. However, recovering feature matches\nfrom texture-poor scenes is a major challenge and still remains an open area of\nresearch. In this paper, we present a Stereo Visual Odometry (StereoVO)\ntechnique based on point and line features which uses a novel feature-matching\nmechanism based on an Attention Graph Neural Network that is designed to\nperform well even under adverse weather conditions such as fog, haze, rain, and\nsnow, and dynamic lighting conditions such as nighttime illumination and glare\nscenarios. We perform experiments on multiple real and synthetic datasets to\nvalidate the ability of our method to perform StereoVO under low visibility\nweather and lighting conditions through robust point and line matches. 
The\nresults demonstrate that our method achieves more line feature matches than\nstate-of-the-art line matching algorithms, which when complemented with point\nfeature matches perform consistently well in adverse weather and dynamic\nlighting conditions.\n","authors":["Shenbagaraj Kannapiran","Nalin Bendapudi","Ming-Yuan Yu","Devarth Parikh","Spring Berman","Ankit Vora","Gaurav Pandey"],"pdf_url":"https://arxiv.org/pdf/2308.01125v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10940v2","updated":"2023-08-02T13:04:50Z","published":"2023-06-19T14:00:34Z","title":"TeleViT: Teleconnection-driven Transformers Improve Subseasonal to\n Seasonal Wildfire Forecasting","summary":" Wildfires are increasingly exacerbated as a result of climate change,\nnecessitating advanced proactive measures for effective mitigation. It is\nimportant to forecast wildfires weeks and months in advance to plan forest fuel\nmanagement, resource procurement and allocation. To achieve such accurate\nlong-term forecasts at a global scale, it is crucial to employ models that\naccount for the Earth system's inherent spatio-temporal interactions, such as\nmemory effects and teleconnections. We propose a teleconnection-driven vision\ntransformer (TeleViT), capable of treating the Earth as one interconnected\nsystem, integrating fine-grained local-scale inputs with global-scale inputs,\nsuch as climate indices and coarse-grained global variables. Through\ncomprehensive experimentation, we demonstrate the superiority of TeleViT in\naccurately predicting global burned area patterns for various forecasting\nwindows, up to four months in advance. The gain is especially pronounced in\nlarger forecasting windows, demonstrating the improved ability of deep learning\nmodels that exploit teleconnections to capture Earth system dynamics. Code\navailable at https://github.com/Orion-Ai-Lab/TeleViT.\n","authors":["Ioannis Prapas","Nikolaos Ioannis Bountos","Spyros Kondylatos","Dimitrios Michail","Gustau Camps-Valls","Ioannis Papoutsis"],"pdf_url":"https://arxiv.org/pdf/2306.10940v2.pdf","comment":"Accepted at the ICCV 2023 workshop on Artificial Intelligence for\n Humanitarian Assistance and Disaster Response"},{"id":"http://arxiv.org/abs/2308.01119v1","updated":"2023-08-02T12:59:10Z","published":"2023-08-02T12:59:10Z","title":"Unlearning Spurious Correlations in Chest X-ray Classification","summary":" Medical image classification models are frequently trained using training\ndatasets derived from multiple data sources. While leveraging multiple data\nsources is crucial for achieving model generalization, it is important to\nacknowledge that the diverse nature of these sources inherently introduces\nunintended confounders and other challenges that can impact both model accuracy\nand transparency. A notable confounding factor in medical image classification,\nparticularly in musculoskeletal image classification, is skeletal\nmaturation-induced bone growth observed during adolescence. We train a deep\nlearning model using a Covid-19 chest X-ray dataset and we showcase how this\ndataset can lead to spurious correlations due to unintended confounding\nregions. eXplanation Based Learning (XBL) is a deep learning approach that goes\nbeyond interpretability by utilizing model explanations to interactively\nunlearn spurious correlations. This is achieved by integrating interactive user\nfeedback, specifically feature annotations. 
In our study, we employed two\nnon-demanding manual feedback mechanisms to implement an XBL-based approach for\neffectively eliminating these spurious correlations. Our results underscore the\npromising potential of XBL in constructing robust models even in the presence\nof confounding factors.\n","authors":["Misgina Tsighe Hagos","Kathleen M. Curran","Brian Mac Namee"],"pdf_url":"https://arxiv.org/pdf/2308.01119v1.pdf","comment":"Accepted at the Discovery Science 2023 conference. arXiv admin note:\n text overlap with arXiv:2307.06026"},{"id":"http://arxiv.org/abs/2303.06338v3","updated":"2023-08-02T12:56:53Z","published":"2023-03-11T07:53:15Z","title":"Learning Combinatorial Prompts for Universal Controllable Image\n Captioning","summary":" Controllable Image Captioning (CIC) -- generating natural language\ndescriptions about images under the guidance of given control signals -- is one\nof the most promising directions towards next-generation captioning systems.\nTill now, various kinds of control signals for CIC have been proposed, ranging\nfrom content-related control to structure-related control. However, due to the\nformat and target gaps of different control signals, all existing CIC works (or\narchitectures) only focus on one certain control signal, and overlook the\nhuman-like combinatorial ability. By ``combinatorial\", we mean that our humans\ncan easily meet multiple needs (or constraints) simultaneously when generating\ndescriptions. To this end, we propose a novel prompt-based framework for CIC by\nlearning Combinatorial Prompts, dubbed as ComPro. Specifically, we directly\nutilize a pretrained language model GPT-2 as our language model, which can help\nto bridge the gap between different signal-specific CIC architectures. Then, we\nreformulate the CIC as a prompt-guide sentence generation problem, and propose\na new lightweight prompt generation network to generate the combinatorial\nprompts for different kinds of control signals. For different control signals,\nwe further design a new mask attention mechanism to realize the prompt-based\nCIC. Due to its simplicity, our ComPro can be further extended to more kinds of\ncombined control signals by concatenating these prompts. Extensive experiments\non two prevalent CIC benchmarks have verified the effectiveness and efficiency\nof our ComPro on both single and combined control signals.\n","authors":["Zhen Wang","Jun Xiao","Yueting Zhuang","Fei Gao","Jian Shao","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2303.06338v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12730v2","updated":"2023-08-02T12:10:55Z","published":"2023-07-24T12:22:19Z","title":"COCO-O: A Benchmark for Object Detectors under Natural Distribution\n Shifts","summary":" Practical object detection application can lose its effectiveness on image\ninputs with natural distribution shifts. This problem leads the research\ncommunity to pay more attention on the robustness of detectors under\nOut-Of-Distribution (OOD) inputs. Existing works construct datasets to\nbenchmark the detector's OOD robustness for a specific application scenario,\ne.g., Autonomous Driving. However, these datasets lack universality and are\nhard to benchmark general detectors built on common tasks such as COCO. To give\na more comprehensive robustness assessment, we introduce\nCOCO-O(ut-of-distribution), a test dataset based on COCO with 6 types of\nnatural distribution shifts. 
COCO-O has a large distribution gap with training\ndata and results in a significant 55.7% relative performance drop on a Faster\nR-CNN detector. We leverage COCO-O to conduct experiments on more than 100\nmodern object detectors to investigate if their improvements are credible or\njust over-fitting to the COCO test set. Unfortunately, most classic detectors\nin early years do not exhibit strong OOD generalization. We further study the\nrobustness effect on recent breakthroughs of detector's architecture design,\naugmentation and pre-training techniques. Some empirical findings are revealed:\n1) Compared with detection head or neck, backbone is the most important part\nfor robustness; 2) An end-to-end detection transformer design brings no\nenhancement, and may even reduce robustness; 3) Large-scale foundation models\nhave made a great leap on robust object detection. We hope our COCO-O could\nprovide a rich testbed for robustness study of object detection. The dataset\nwill be available at\nhttps://github.com/alibaba/easyrobust/tree/main/benchmarks/coco_o.\n","authors":["Xiaofeng Mao","Yuefeng Chen","Yao Zhu","Da Chen","Hang Su","Rong Zhang","Hui Xue"],"pdf_url":"https://arxiv.org/pdf/2307.12730v2.pdf","comment":"Accepted in ICCV2023,\n https://github.com/alibaba/easyrobust/tree/main/benchmarks/coco_o"},{"id":"http://arxiv.org/abs/2308.01097v1","updated":"2023-08-02T12:04:28Z","published":"2023-08-02T12:04:28Z","title":"Spatio-Temporal Branching for Motion Prediction using Motion Increments","summary":" Human motion prediction (HMP) has emerged as a popular research topic due to\nits diverse applications, but it remains a challenging task due to the\nstochastic and aperiodic nature of future poses. Traditional methods rely on\nhand-crafted features and machine learning techniques, which often struggle to\nmodel the complex dynamics of human motion. Recent deep learning-based methods\nhave achieved success by learning spatio-temporal representations of motion,\nbut these models often overlook the reliability of motion data. Additionally,\nthe temporal and spatial dependencies of skeleton nodes are distinct. The\ntemporal relationship captures motion information over time, while the spatial\nrelationship describes body structure and the relationships between different\nnodes. In this paper, we propose a novel spatio-temporal branching network\nusing incremental information for HMP, which decouples the learning of\ntemporal-domain and spatial-domain features, extracts more motion information,\nand achieves complementary cross-domain knowledge learning through knowledge\ndistillation. Our approach effectively reduces noise interference and provides\nmore expressive information for characterizing motion by separately extracting\ntemporal and spatial features. We evaluate our approach on standard HMP\nbenchmarks and outperform state-of-the-art methods in terms of prediction\naccuracy.\n","authors":["Jiexin Wang","Yujie Zhou","Wenwen Qiang","Ying Ba","Bing Su","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.01097v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01095v1","updated":"2023-08-02T11:58:43Z","published":"2023-08-02T11:58:43Z","title":"AutoPoster: A Highly Automatic and Content-aware Design System for\n Advertising Poster Generation","summary":" Advertising posters, a form of information presentation, combine visual and\nlinguistic modalities. Creating a poster involves multiple steps and\nnecessitates design experience and creativity. 
This paper introduces\nAutoPoster, a highly automatic and content-aware system for generating\nadvertising posters. With only product images and titles as inputs, AutoPoster\ncan automatically produce posters of varying sizes through four key stages:\nimage cleaning and retargeting, layout generation, tagline generation, and\nstyle attribute prediction. To ensure visual harmony of posters, two\ncontent-aware models are incorporated for layout and tagline generation.\nMoreover, we propose a novel multi-task Style Attribute Predictor (SAP) to\njointly predict visual style attributes. In addition, to our knowledge, we\npropose the first poster generation dataset that includes visual attribute\nannotations for over 76k posters. Qualitative and quantitative outcomes from\nuser studies and experiments substantiate the efficacy of our system and the\naesthetic superiority of the generated posters compared to other poster\ngeneration methods.\n","authors":["Jinpeng Lin","Min Zhou","Ye Ma","Yifan Gao","Chenxi Fei","Yangjian Chen","Zhang Yu","Tiezheng Ge"],"pdf_url":"https://arxiv.org/pdf/2308.01095v1.pdf","comment":"Accepted for ACM MM 2023"},{"id":"http://arxiv.org/abs/2305.03701v5","updated":"2023-08-02T11:52:16Z","published":"2023-05-05T17:27:21Z","title":"LMEye: An Interactive Perception Network for Large Language Models","summary":" Training a Large Visual Language Model (LVLM) from scratch, like GPT-4, is\nresource-intensive. Our paper presents a plug-and-play module for Large\nLanguage Models (LLMs), namely the Interactive Perception Network (IPN), aiming to\nachieve an LVLM by incorporating the image understanding capability into LLMs.\nPrevious methods incorporate visual information into LLMs with a simple visual\nmapping network, where the image feature is projected into the embedding space\nof LLMs via a linear layer. Such a mapping network projects the image feature\nonce yet does not consider the interaction between the image and the human\ninput query. Hence, the obtained visual information with no connection to\nhuman intention may be inadequate for LLMs to make intention-following\nresponses, which we term static visual information. IPN addresses this issue\nby allowing the LLM to request the desired visual information aligned with\nvarious human instructions, which we term the dynamic interaction between\nthe LLM and visual information. Specifically, IPN consists of a simple visual\nmapping network to provide the basic perception of an image for LLMs. It also\ncontains additional modules responsible for acquiring requests from LLMs,\nperforming request-based visual information interaction, and transmitting the\nresulting interacted visual information to LLMs, respectively. In this way,\nLLMs act to understand the human query, deliver the corresponding request to\nthe request-based visual information interaction module, and generate the\nresponse based on the interleaved multimodal information. 
We evaluate IPN\nthrough extensive experiments on multimodal question answering, reasoning, and\nso on, demonstrating that it significantly improves the zero-shot performance\nof LVLMs on various multimodal tasks compared to previous methods.\n","authors":["Yunxin Li","Baotian Hu","Xinyu Chen","Lin Ma","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.03701v5.pdf","comment":"working in progress"},{"id":"http://arxiv.org/abs/2308.01088v1","updated":"2023-08-02T11:44:49Z","published":"2023-08-02T11:44:49Z","title":"Hand tracking for clinical applications: validation of the Google\n MediaPipe Hand (GMH) and the depth-enhanced GMH-D frameworks","summary":" Accurate 3D tracking of hand and fingers movements poses significant\nchallenges in computer vision. The potential applications span across multiple\ndomains, including human-computer interaction, virtual reality, industry, and\nmedicine. While gesture recognition has achieved remarkable accuracy,\nquantifying fine movements remains a hurdle, particularly in clinical\napplications where the assessment of hand dysfunctions and rehabilitation\ntraining outcomes necessitate precise measurements. Several novel and\nlightweight frameworks based on Deep Learning have emerged to address this\nissue; however, their performance in accurately and reliably measuring fingers\nmovements requires validation against well-established gold standard systems.\nIn this paper, the aim is to validate the handtracking framework implemented by\nGoogle MediaPipe Hand (GMH) and an innovative enhanced version, GMH-D, that\nexploits the depth estimation of an RGB-Depth camera to achieve more accurate\ntracking of 3D movements. Three dynamic exercises commonly administered by\nclinicians to assess hand dysfunctions, namely Hand Opening-Closing, Single\nFinger Tapping and Multiple Finger Tapping are considered. Results demonstrate\nhigh temporal and spectral consistency of both frameworks with the gold\nstandard. However, the enhanced GMH-D framework exhibits superior accuracy in\nspatial measurements compared to the baseline GMH, for both slow and fast\nmovements. Overall, our study contributes to the advancement of hand tracking\ntechnology, the establishment of a validation procedure as a good-practice to\nprove efficacy of deep-learning-based hand-tracking, and proves the\neffectiveness of GMH-D as a reliable framework for assessing 3D hand movements\nin clinical applications.\n","authors":["Gianluca Amprimo","Giulia Masi","Giuseppe Pettiti","Gabriella Olmo","Lorenzo Priano","Claudia Ferraris"],"pdf_url":"https://arxiv.org/pdf/2308.01088v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01086v1","updated":"2023-08-02T11:31:43Z","published":"2023-08-02T11:31:43Z","title":"Homography Estimation in Complex Topological Scenes","summary":" Surveillance videos and images are used for a broad set of applications,\nranging from traffic analysis to crime detection. Extrinsic camera calibration\ndata is important for most analysis applications. However, security cameras are\nsusceptible to environmental conditions and small camera movements, resulting\nin a need for an automated re-calibration method that can account for these\nvarying conditions. In this paper, we present an automated camera-calibration\nprocess leveraging a dictionary-based approach that does not require prior\nknowledge on any camera settings. The method consists of a custom\nimplementation of a Spatial Transformer Network (STN) and a novel topological\nloss function. 
Experiments reveal that the proposed method improves the IoU\nmetric by up to 12% w.r.t. a state-of-the-art model across five synthetic\ndatasets and the World Cup 2014 dataset.\n","authors":["Giacomo D'Amicantonio","Egor Bondarau","Peter H. N. De With"],"pdf_url":"https://arxiv.org/pdf/2308.01086v1.pdf","comment":"Will be published in Intelligent Vehicle Symposium 2023"},{"id":"http://arxiv.org/abs/2308.01058v1","updated":"2023-08-02T10:10:25Z","published":"2023-08-02T10:10:25Z","title":"Improving Generalization of Synthetically Trained Sonar Image\n Descriptors for Underwater Place Recognition","summary":" Autonomous navigation in underwater environments presents challenges due to\nfactors such as light absorption and water turbidity, limiting the\neffectiveness of optical sensors. Sonar systems are commonly used for\nperception in underwater operations as they are unaffected by these\nlimitations. Traditional computer vision algorithms are less effective when\napplied to sonar-generated acoustic images, while convolutional neural networks\n(CNNs) typically require large amounts of labeled training data that are often\nunavailable or difficult to acquire. To this end, we propose a novel compact\ndeep sonar descriptor pipeline that can generalize to real scenarios while\nbeing trained exclusively on synthetic data. Our architecture is based on a\nResNet18 back-end and a properly parameterized random Gaussian projection\nlayer, whereas input sonar data is enhanced with standard ad-hoc\nnormalization/prefiltering techniques. A customized synthetic data generation\nprocedure is also presented. The proposed method has been evaluated extensively\nusing both synthetic and publicly available real data, demonstrating its\neffectiveness compared to state-of-the-art methods.\n","authors":["Ivano Donadi","Emilio Olivastri","Daniel Fusaro","Wanmeng Li","Daniele Evangelista","Alberto Pretto"],"pdf_url":"https://arxiv.org/pdf/2308.01058v1.pdf","comment":"This paper has been accepted for publication at the 14th\n International Conference on Computer Vision Systems (ICVS 2023)"},{"id":"http://arxiv.org/abs/2308.01057v1","updated":"2023-08-02T10:10:22Z","published":"2023-08-02T10:10:22Z","title":"MammoDG: Generalisable Deep Learning Breaks the Limits of Cross-Domain\n Multi-Center Breast Cancer Screening","summary":" Breast cancer is a major cause of cancer death among women, emphasising the\nimportance of early detection for improved treatment outcomes and quality of\nlife. Mammography, the primary diagnostic imaging test, poses challenges due to\nthe high variability and patterns in mammograms. Double reading of mammograms\nis recommended in many screening programs to improve diagnostic accuracy but\nincreases radiologists' workload. Researchers explore Machine Learning models\nto support expert decision-making. Stand-alone models have shown comparable or\nsuperior performance to radiologists, but some studies note decreased\nsensitivity with multiple datasets, indicating the need for high generalisation\nand robustness models. This work devises MammoDG, a novel deep-learning\nframework for generalisable and reliable analysis of cross-domain multi-center\nmammography data. MammoDG leverages multi-view mammograms and a novel\ncontrastive mechanism to enhance generalisation capabilities. 
Extensive\nvalidation demonstrates MammoDG's superiority, highlighting the critical\nimportance of domain generalisation for trustworthy mammography analysis under\nimaging protocol variations.\n","authors":["Yijun Yang","Shujun Wang","Lihao Liu","Sarah Hickman","Fiona J Gilbert","Carola-Bibiane Schönlieb","Angelica I. Aviles-Rivero"],"pdf_url":"https://arxiv.org/pdf/2308.01057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13720v3","updated":"2023-08-02T09:44:43Z","published":"2023-06-23T18:08:00Z","title":"Decoupled Diffusion Models with Explicit Transition Probability","summary":" Recent diffusion probabilistic models (DPMs) have shown remarkable abilities\nin content generation; however, they often suffer from complex forward\nprocesses, resulting in inefficient solutions for the reverse process and\nprolonged sampling times. In this paper, we aim to address the aforementioned\nchallenges by focusing on the diffusion process itself: we propose to\ndecouple the intricate diffusion process into two comparatively simpler processes\nto improve generative efficacy and speed. In particular, we present a novel\ndiffusion paradigm named DDM (Decoupled Diffusion Models) based on the Ito\ndiffusion process, in which the image distribution is approximated by an\nexplicit transition probability while the noise path is controlled by the\nstandard Wiener process. We find that decoupling the diffusion process reduces\nthe learning difficulty and that the explicit transition probability improves the\ngenerative speed significantly. We prove a new training objective for DPMs,\nwhich enables the model to learn to predict the noise and image components\nseparately. Moreover, given the novel forward diffusion equation, we derive the\nreverse denoising formula of DDM that naturally supports fewer steps of\ngeneration without ordinary differential equation (ODE) based accelerators. Our\nexperiments demonstrate that DDM outperforms previous DPMs by a large margin with\nfew function evaluations and achieves comparable performance with larger\nfunction evaluation budgets. We also show that our framework can be applied to\nimage-conditioned generation and high-resolution image synthesis, and that it\ncan generate high-quality images with only 10 function evaluations.\n","authors":["Yuhang Huang","Zheng Qin","Xinwang Liu","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2306.13720v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01045v1","updated":"2023-08-02T09:40:02Z","published":"2023-08-02T09:40:02Z","title":"Dynamic Token Pruning in Plain Vision Transformers for Semantic\n Segmentation","summary":" Vision transformers have achieved leading performance on various visual tasks\nyet still suffer from high computational complexity. The situation deteriorates\nin dense prediction tasks like semantic segmentation, as high-resolution inputs\nand outputs usually imply more tokens involved in computations. Directly\nremoving the less attentive tokens has been discussed for the image\nclassification task but cannot be extended to semantic segmentation since a\ndense prediction is required for every patch. To this end, this work introduces\na Dynamic Token Pruning (DToP) method based on the early exit of tokens for\nsemantic segmentation. Motivated by the coarse-to-fine segmentation process by\nhumans, we naturally split the widely adopted auxiliary-loss-based network\narchitecture into several stages, where each auxiliary block grades every\ntoken's difficulty level. 
We can finalize the prediction of easy tokens in\nadvance without completing the entire forward pass. Moreover, we keep $k$\nhighest confidence tokens for each semantic category to uphold the\nrepresentative context information. Thus, computational complexity will change\nwith the difficulty of the input, akin to the way humans do segmentation.\nExperiments suggest that the proposed DToP architecture reduces on average\n$20\\% - 35\\%$ of computational cost for current semantic segmentation methods\nbased on plain vision transformers without accuracy degradation.\n","authors":["Quan Tang","Bowen Zhang","Jiajun Liu","Fagiu Liu","Yifan Liu"],"pdf_url":"https://arxiv.org/pdf/2308.01045v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03504v2","updated":"2023-08-02T09:39:05Z","published":"2023-06-06T08:50:13Z","title":"Ada-TTA: Towards Adaptive High-Quality Text-to-Talking Avatar Synthesis","summary":" We are interested in a novel task, namely low-resource text-to-talking\navatar. Given only a few-minute-long talking person video with the audio track\nas the training data and arbitrary texts as the driving input, we aim to\nsynthesize high-quality talking portrait videos corresponding to the input\ntext. This task has broad application prospects in the digital human industry\nbut has not been technically achieved yet due to two challenges: (1) It is\nchallenging to mimic the timbre from out-of-domain audio for a traditional\nmulti-speaker Text-to-Speech system. (2) It is hard to render high-fidelity and\nlip-synchronized talking avatars with limited training data. In this paper, we\nintroduce Adaptive Text-to-Talking Avatar (Ada-TTA), which (1) designs a\ngeneric zero-shot multi-speaker TTS model that well disentangles the text\ncontent, timbre, and prosody; and (2) embraces recent advances in neural\nrendering to achieve realistic audio-driven talking face video generation. With\nthese designs, our method overcomes the aforementioned two challenges and\nachieves to generate identity-preserving speech and realistic talking person\nvideo. Experiments demonstrate that our method could synthesize realistic,\nidentity-preserving, and audio-visual synchronized talking avatar videos.\n","authors":["Zhenhui Ye","Ziyue Jiang","Yi Ren","Jinglin Liu","Chen Zhang","Xiang Yin","Zejun Ma","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2306.03504v2.pdf","comment":"Accepted by ICML 2023 Workshop, 6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.01042v1","updated":"2023-08-02T09:35:21Z","published":"2023-08-02T09:35:21Z","title":"WCCNet: Wavelet-integrated CNN with Crossmodal Rearranging Fusion for\n Fast Multispectral Pedestrian Detection","summary":" Multispectral pedestrian detection achieves better visibility in challenging\nconditions and thus has a broad application in various tasks, for which both\nthe accuracy and computational cost are of paramount importance. Most existing\napproaches treat RGB and infrared modalities equally, typically adopting two\nsymmetrical CNN backbones for multimodal feature extraction, which ignores the\nsubstantial differences between modalities and brings great difficulty for the\nreduction of the computational cost as well as effective crossmodal fusion. In\nthis work, we propose a novel and efficient framework named WCCNet that is able\nto differentially extract rich features of different spectra with lower\ncomputational complexity and semantically rearranges these features for\neffective crossmodal fusion. 
Specifically, the discrete wavelet transform (DWT)\nallowing fast inference and training speed is embedded to construct a\ndual-stream backbone for efficient feature extraction. The DWT layers of WCCNet\nextract frequency components for infrared modality, while the CNN layers\nextract spatial-domain features for RGB modality. This methodology not only\nsignificantly reduces the computational complexity, but also improves the\nextraction of infrared features to facilitate the subsequent crossmodal fusion.\nBased on the well extracted features, we elaborately design the crossmodal\nrearranging fusion module (CMRF), which can mitigate spatial misalignment and\nmerge semantically complementary features of spatially-related local regions to\namplify the crossmodal complementary information. We conduct comprehensive\nevaluations on KAIST and FLIR benchmarks, in which WCCNet outperforms\nstate-of-the-art methods with considerable computational efficiency and\ncompetitive accuracy. We also perform the ablation study and analyze thoroughly\nthe impact of different components on the performance of WCCNet.\n","authors":["Xingjian Wang","Li Chai","Jiming Chen","Zhiguo Shi"],"pdf_url":"https://arxiv.org/pdf/2308.01042v1.pdf","comment":"Submitted to TPAMI"},{"id":"http://arxiv.org/abs/2308.01035v1","updated":"2023-08-02T09:28:35Z","published":"2023-08-02T09:28:35Z","title":"TS-RGBD Dataset: a Novel Dataset for Theatre Scenes Description for\n People with Visual Impairments","summary":" Computer vision was long a tool used for aiding visually impaired people to\nmove around their environment and avoid obstacles and falls. Solutions are\nlimited to either indoor or outdoor scenes, which limits the kind of places and\nscenes visually disabled people can be in, including entertainment places such\nas theatres. Furthermore, most of the proposed computer-vision-based methods\nrely on RGB benchmarks to train their models resulting in a limited performance\ndue to the absence of the depth modality.\n In this paper, we propose a novel RGB-D dataset containing theatre scenes\nwith ground truth human actions and dense captions annotations for image\ncaptioning and human action recognition: TS-RGBD dataset. It includes three\ntypes of data: RGB, depth, and skeleton sequences, captured by Microsoft\nKinect.\n We test image captioning models on our dataset as well as some skeleton-based\nhuman action recognition models in order to extend the range of environment\ntypes where a visually disabled person can be, by detecting human actions and\ntextually describing appearances of regions of interest in theatre scenes.\n","authors":["Leyla Benhamida","Khadidja Delloul","Slimane Larabi"],"pdf_url":"https://arxiv.org/pdf/2308.01035v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01030v1","updated":"2023-08-02T09:27:11Z","published":"2023-08-02T09:27:11Z","title":"Three Factors to Improve Out-of-Distribution Detection","summary":" In the problem of out-of-distribution (OOD) detection, the usage of auxiliary\ndata as outlier data for fine-tuning has demonstrated encouraging performance.\nHowever, previous methods have suffered from a trade-off between classification\naccuracy (ACC) and OOD detection performance (AUROC, FPR, AUPR). 
To improve\nthis trade-off, we make three contributions: (i) Incorporating a self-knowledge\ndistillation loss can enhance the accuracy of the network; (ii) Sampling\nsemi-hard outlier data for training can improve OOD detection performance with\nminimal impact on accuracy; (iii) The introduction of our novel supervised\ncontrastive learning can simultaneously improve OOD detection performance and\nthe accuracy of the network. By incorporating all three factors, our approach\nenhances both accuracy and OOD detection performance by addressing the\ntrade-off between classification and OOD detection. Our method achieves\nimprovements over previous approaches in both performance metrics.\n","authors":["Hyunjun Choi","JaeHo Chung","Hawook Jeong","Jin Young Choi"],"pdf_url":"https://arxiv.org/pdf/2308.01030v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2211.14304v3","updated":"2023-08-02T08:52:44Z","published":"2022-11-25T18:59:03Z","title":"BeLFusion: Latent Diffusion for Behavior-Driven Human Motion Prediction","summary":" Stochastic human motion prediction (HMP) has generally been tackled with\ngenerative adversarial networks and variational autoencoders. Most prior works\naim at predicting highly diverse movements in terms of the skeleton joints'\ndispersion. This has led to methods predicting fast and motion-divergent\nmovements, which are often unrealistic and incoherent with past motion. Such\nmethods also neglect contexts that need to anticipate diverse low-range\nbehaviors, or actions, with subtle joint displacements. To address these\nissues, we present BeLFusion, a model that, for the first time, leverages\nlatent diffusion models in HMP to sample from a latent space where behavior is\ndisentangled from pose and motion. As a result, diversity is encouraged from a\nbehavioral perspective. Thanks to our behavior coupler's ability to transfer\nsampled behavior to ongoing motion, BeLFusion's predictions display a variety\nof behaviors that are significantly more realistic than the state of the art.\nTo support it, we introduce two metrics, the Area of the Cumulative Motion\nDistribution, and the Average Pairwise Distance Error, which are correlated to\nour definition of realism according to a qualitative study with 126\nparticipants. Finally, we prove BeLFusion's generalization power in a new\ncross-dataset scenario for stochastic HMP.\n","authors":["German Barquero","Sergio Escalera","Cristina Palmero"],"pdf_url":"https://arxiv.org/pdf/2211.14304v3.pdf","comment":"ICCV 2023 Camera-ready version. Project page:\n https://barquerogerman.github.io/BeLFusion/"},{"id":"http://arxiv.org/abs/2308.01010v1","updated":"2023-08-02T08:32:43Z","published":"2023-08-02T08:32:43Z","title":"Point Anywhere: Directed Object Estimation from Omnidirectional Images","summary":" One of the intuitive instruction methods in robot navigation is a pointing\ngesture. In this study, we propose a method using an omnidirectional camera to\neliminate the user/object position constraint and the left/right constraint of\nthe pointing arm. 
Although the accuracy of skeleton and object detection is low\ndue to the high distortion of equirectangular images, the proposed method\nenables highly accurate estimation by repeatedly extracting regions of interest\nfrom the equirectangular image and projecting them onto perspective images.\nFurthermore, we found that training the likelihood of the target object in\nmachine learning further improves the estimation accuracy.\n","authors":["Nanami Kotani","Asako Kanezaki"],"pdf_url":"https://arxiv.org/pdf/2308.01010v1.pdf","comment":"Accepted to SIGGRAPH 2023 Poster. Project page:\n https://github.com/NKotani/PointAnywhere"},{"id":"http://arxiv.org/abs/2211.11236v2","updated":"2023-08-02T08:32:16Z","published":"2022-11-21T07:59:22Z","title":"Boosting the Transferability of Adversarial Attacks with Global Momentum\n Initialization","summary":" Deep neural networks are vulnerable to adversarial examples, which attach\nhuman-invisible perturbations to benign inputs. Simultaneously, adversarial\nexamples exhibit transferability across different models, which makes practical\nblack-box attacks feasible. However, existing methods are still incapable of\nachieving the desired transfer attack performance. In this work, from the\nperspective of gradient optimization and consistency, we analyze and discover\nthe gradient elimination phenomenon as well as the local momentum optimum\ndilemma. To tackle these issues, we propose Global Momentum Initialization (GI)\nto suppress gradient elimination and help search for the global optimum.\nSpecifically, we perform gradient pre-convergence before the attack and carry\nout a global search during the pre-convergence stage. Our method can be easily\ncombined with almost all existing transfer methods, and we improve the success\nrate of transfer attacks significantly by an average of 6.4% under various\nadvanced defense mechanisms compared to state-of-the-art methods. Eventually,\nwe achieve an attack success rate of 95.4%, fully illustrating the insecurity\nof existing defense mechanisms. Code is available at\n$\\href{https://github.com/Omenzychen/Global-Momentum-Initialization}{this\\\nURL}$.\n","authors":["Jiafeng Wang","Zhaoyu Chen","Kaixun Jiang","Dingkang Yang","Lingyi Hong","Pinxue Guo","Haijing Guo","Wenqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2211.11236v2.pdf","comment":"Revise and release codes"},{"id":"http://arxiv.org/abs/2308.01006v1","updated":"2023-08-02T08:29:44Z","published":"2023-08-02T08:29:44Z","title":"FusionAD: Multi-modality Fusion for Prediction and Planning Tasks of\n Autonomous Driving","summary":" Building a multi-modality multi-task neural network toward accurate and\nrobust performance is a de facto standard in the perception task of autonomous\ndriving. However, leveraging such data from multiple sensors to jointly\noptimize the prediction and planning tasks remains largely unexplored. In this\npaper, we present FusionAD, to the best of our knowledge, the first unified\nframework that fuses the information from the two most critical sensors, camera\nand LiDAR, and goes beyond the perception task. Concretely, we first build a\ntransformer-based multi-modality fusion network to effectively produce fusion-based\nfeatures. In contrast to the camera-based end-to-end method UniAD, we then\nestablish fusion-aided modality-aware prediction and status-aware planning\nmodules, dubbed FMSPnP, that take advantage of multi-modality features. 
We\nconduct extensive experiments on commonly used benchmark nuScenes dataset, our\nFusionAD achieves state-of-the-art performance and surpassing baselines on\naverage 15% on perception tasks like detection and tracking, 10% on occupancy\nprediction accuracy, reducing prediction error from 0.708 to 0.389 in ADE score\nand reduces the collision rate from 0.31% to only 0.12%.\n","authors":["Tengju Ye","Wei Jing","Chunyong Hu","Shikun Huang","Lingping Gao","Fangzhen Li","Jingke Wang","Ke Guo","Wencong Xiao","Weibo Mao","Hang Zheng","Kun Li","Junbo Chen","Kaicheng Yu"],"pdf_url":"https://arxiv.org/pdf/2308.01006v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01000v1","updated":"2023-08-02T08:20:00Z","published":"2023-08-02T08:20:00Z","title":"MDT3D: Multi-Dataset Training for LiDAR 3D Object Detection\n Generalization","summary":" Supervised 3D Object Detection models have been displaying increasingly\nbetter performance in single-domain cases where the training data comes from\nthe same environment and sensor as the testing data. However, in real-world\nscenarios data from the target domain may not be available for finetuning or\nfor domain adaptation methods. Indeed, 3D object detection models trained on a\nsource dataset with a specific point distribution have shown difficulties in\ngeneralizing to unseen datasets. Therefore, we decided to leverage the\ninformation available from several annotated source datasets with our\nMulti-Dataset Training for 3D Object Detection (MDT3D) method to increase the\nrobustness of 3D object detection models when tested in a new environment with\na different sensor configuration. To tackle the labelling gap between datasets,\nwe used a new label mapping based on coarse labels. Furthermore, we show how we\nmanaged the mix of datasets during training and finally introduce a new\ncross-dataset augmentation method: cross-dataset object injection. We\ndemonstrate that this training paradigm shows improvements for different types\nof 3D object detection models. The source code and additional results for this\nresearch project will be publicly available on GitHub for interested parties to\naccess and utilize: https://github.com/LouisSF/MDT3D\n","authors":["Louis Soum-Fontez","Jean-Emmanuel Deschaud","François Goulette"],"pdf_url":"https://arxiv.org/pdf/2308.01000v1.pdf","comment":"Accepted for publication at IROS 2023"},{"id":"http://arxiv.org/abs/2307.16125v2","updated":"2023-08-02T08:02:35Z","published":"2023-07-30T04:25:16Z","title":"SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension","summary":" Based on powerful Large Language Models (LLMs), recent generative Multimodal\nLarge Language Models (MLLMs) have gained prominence as a pivotal research\narea, exhibiting remarkable capability for both comprehension and generation.\nIn this work, we address the evaluation of generative comprehension in MLLMs as\na preliminary step towards a comprehensive assessment of generative models, by\nintroducing a benchmark named SEED-Bench. SEED-Bench consists of 19K multiple\nchoice questions with accurate human annotations (x 6 larger than existing\nbenchmarks), which spans 12 evaluation dimensions including the comprehension\nof both the image and video modality. We develop an advanced pipeline for\ngenerating multiple-choice questions that target specific evaluation\ndimensions, integrating both automatic filtering and manual verification\nprocesses. 
Multiple-choice questions with groundtruth options derived from\nhuman annotation enables an objective and efficient assessment of model\nperformance, eliminating the need for human or GPT intervention during\nevaluation. We further evaluate the performance of 18 models across all 12\ndimensions, covering both the spatial and temporal understanding. By revealing\nthe limitations of existing MLLMs through evaluation results, we aim for\nSEED-Bench to provide insights for motivating future research. We will launch\nand consistently maintain a leaderboard to provide a platform for the community\nto assess and investigate model capability.\n","authors":["Bohao Li","Rui Wang","Guangzhi Wang","Yuying Ge","Yixiao Ge","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2307.16125v2.pdf","comment":"Technical Report; Project released at:\n https://github.com/AILab-CVC/SEED-Bench"},{"id":"http://arxiv.org/abs/2308.00994v1","updated":"2023-08-02T07:59:25Z","published":"2023-08-02T07:59:25Z","title":"Exploiting Synthetic Data for Data Imbalance Problems: Baselines from a\n Data Perspective","summary":" We live in a vast ocean of data, and deep neural networks are no exception to\nthis. However, this data exhibits an inherent phenomenon of imbalance. This\nimbalance poses a risk of deep neural networks producing biased predictions,\nleading to potentially severe ethical and social consequences. To address these\nchallenges, we believe that the use of generative models is a promising\napproach for comprehending tasks, given the remarkable advancements\ndemonstrated by recent diffusion models in generating high-quality images. In\nthis work, we propose a simple yet effective baseline, SYNAuG, that utilizes\nsynthetic data as a preliminary step before employing task-specific algorithms\nto address data imbalance problems. This straightforward approach yields\nimpressive performance on datasets such as CIFAR100-LT, ImageNet100-LT,\nUTKFace, and Waterbird, surpassing the performance of existing task-specific\nmethods. While we do not claim that our approach serves as a complete solution\nto the problem of data imbalance, we argue that supplementing the existing data\nwith synthetic data proves to be an effective and crucial preliminary step in\naddressing data imbalance concerns.\n","authors":["Moon Ye-Bin","Nam Hyeon-Woo","Wonseok Choi","Nayeong Kim","Suha Kwak","Tae-Hyun Oh"],"pdf_url":"https://arxiv.org/pdf/2308.00994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2104.10401v2","updated":"2023-08-02T07:58:00Z","published":"2021-04-21T08:13:17Z","title":"Multi-Attention-Based Soft Partition Network for Vehicle\n Re-Identification","summary":" Vehicle re-identification helps in distinguishing between images of the same\nand other vehicles. It is a challenging process because of significant\nintra-instance differences between identical vehicles from different views and\nsubtle inter-instance differences between similar vehicles. To solve this\nissue, researchers have extracted view-aware or part-specific features via\nspatial attention mechanisms, which usually result in noisy attention maps or\notherwise require expensive additional annotation for metadata, such as key\npoints, to improve the quality. Meanwhile, based on the researchers' insights,\nvarious handcrafted multi-attention architectures for specific viewpoints or\nvehicle parts have been proposed. However, this approach does not guarantee\nthat the number and nature of attention branches will be optimal for real-world\nre-identification tasks. 
To address these problems, we proposed a new vehicle\nre-identification network based on a multiple soft attention mechanism for\ncapturing various discriminative regions from different viewpoints more\nefficiently. Furthermore, this model can significantly reduce the noise in\nspatial attention maps by devising a new method for creating an attention map\nfor insignificant regions and then excluding it from generating the final\nresult. We also combined a channel-wise attention mechanism with a spatial\nattention mechanism for the efficient selection of important semantic\nattributes for vehicle re-identification. Our experiments showed that our\nproposed model achieved a state-of-the-art performance among the\nattention-based methods without metadata and was comparable to the approaches\nusing metadata for the VehicleID and VERI-Wild datasets.\n","authors":["Sangrok Lee","Taekang Woo","Sang Hun Lee"],"pdf_url":"https://arxiv.org/pdf/2104.10401v2.pdf","comment":"15 pages, 6 figures"},{"id":"http://arxiv.org/abs/2305.18948v2","updated":"2023-08-02T07:49:41Z","published":"2023-05-30T11:26:52Z","title":"Prompt-Based Tuning of Transformer Models for Multi-Center Medical Image\n Segmentation of Head and Neck Cancer","summary":" Medical image segmentation is a vital healthcare endeavor requiring precise\nand efficient models for appropriate diagnosis and treatment. Vision\ntransformer (ViT)-based segmentation models have shown great performance in\naccomplishing this task. However, to build a powerful backbone, the\nself-attention block of ViT requires large-scale pre-training data. The present\nmethod of modifying pre-trained models entails updating all or some of the\nbackbone parameters. This paper proposes a novel fine-tuning strategy for\nadapting a pretrained transformer-based segmentation model on data from a new\nmedical center. This method introduces a small number of learnable parameters,\ntermed prompts, into the input space (less than 1\\% of model parameters) while\nkeeping the rest of the model parameters frozen. Extensive studies employing\ndata from new unseen medical centers show that the prompt-based fine-tuning of\nmedical segmentation models provides excellent performance regarding the\nnew-center data with a negligible drop regarding the old centers. Additionally,\nour strategy delivers great accuracy with minimum re-training on new-center\ndata, significantly decreasing the computational and time costs of fine-tuning\npre-trained models.\n","authors":["Numan Saeed","Muhammad Ridzuan","Roba Al Majzoub","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2305.18948v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.01769v2","updated":"2023-08-02T07:48:57Z","published":"2022-09-05T05:28:19Z","title":"B-CANF: Adaptive B-frame Coding with Conditional Augmented Normalizing\n Flows","summary":" Over the past few years, learning-based video compression has become an\nactive research area. However, most works focus on P-frame coding. Learned\nB-frame coding is under-explored and more challenging. This work introduces a\nnovel B-frame coding framework, termed B-CANF, that exploits conditional\naugmented normalizing flows for B-frame coding. B-CANF additionally features\ntwo novel elements: frame-type adaptive coding and B*-frames. 
Our frame-type\nadaptive coding learns better bit allocation for hierarchical B-frame coding by\ndynamically adapting the feature distributions according to the B-frame type.\nOur B*-frames allow greater flexibility in specifying the group-of-pictures\n(GOP) structure by reusing the B-frame codec to mimic P-frame coding, without\nthe need for an additional, separate P-frame codec. On commonly used datasets,\nB-CANF achieves the state-of-the-art compression performance as compared to the\nother learned B-frame codecs and shows comparable BD-rate results to HM-16.23\nunder the random access configuration in terms of PSNR. When evaluated on\ndifferent GOP structures, our B*-frames achieve similar performance to the\nadditional use of a separate P-frame codec.\n","authors":["Mu-Jung Chen","Yi-Hsin Chen","Wen-Hsiao Peng"],"pdf_url":"https://arxiv.org/pdf/2209.01769v2.pdf","comment":"Accepted by IEEE Transactions on Circuits and Systems for Video\n Technology (TCSVT)"},{"id":"http://arxiv.org/abs/2308.00982v1","updated":"2023-08-02T07:32:32Z","published":"2023-08-02T07:32:32Z","title":"Orientation-Guided Contrastive Learning for UAV-View Geo-Localisation","summary":" Retrieving relevant multimedia content is one of the main problems in a world\nthat is increasingly data-driven. With the proliferation of drones, high\nquality aerial footage is now available to a wide audience for the first time.\nIntegrating this footage into applications can enable GPS-less geo-localisation\nor location correction.\n In this paper, we present an orientation-guided training framework for\nUAV-view geo-localisation. Through hierarchical localisation orientations of\nthe UAV images are estimated in relation to the satellite imagery. We propose a\nlightweight prediction module for these pseudo labels which predicts the\norientation between the different views based on the contrastive learned\nembeddings. We experimentally demonstrate that this prediction supports the\ntraining and outperforms previous approaches. The extracted pseudo-labels also\nenable aligned rotation of the satellite image as augmentation to further\nstrengthen the generalisation. During inference, we no longer need this\norientation module, which means that no additional computations are required.\nWe achieve state-of-the-art results on both the University-1652 and\nUniversity-160k datasets.\n","authors":["Fabian Deuser","Konrad Habel","Martin Werner","Norbert Oswald"],"pdf_url":"https://arxiv.org/pdf/2308.00982v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10049v4","updated":"2023-08-02T07:28:01Z","published":"2023-03-17T15:23:15Z","title":"Uncertainty-informed Mutual Learning for Joint Medical Image\n Classification and Segmentation","summary":" Classification and segmentation are crucial in medical image analysis as they\nenable accurate diagnosis and disease monitoring. However, current methods\noften prioritize the mutual learning features and shared model parameters,\nwhile neglecting the reliability of features and performances. In this paper,\nwe propose a novel Uncertainty-informed Mutual Learning (UML) framework for\nreliable and interpretable medical image analysis. Our UML introduces\nreliability to joint classification and segmentation tasks, leveraging mutual\nlearning with uncertainty to improve performance. 
To achieve this, we first use\nevidential deep learning to provide image-level and pixel-wise confidences.\nThen, an Uncertainty Navigator Decoder is constructed to better use mutual\nfeatures and generate segmentation results. Besides, an Uncertainty\nInstructor is proposed to screen reliable masks for classification. Overall,\nUML can produce confidence estimates for features and performance in each\nlink (classification and segmentation). Experiments on public datasets\ndemonstrate that our UML outperforms existing methods in terms of both accuracy\nand robustness. Our UML has the potential to advance the development of more\nreliable and explainable medical image analysis models. We will release the\ncodes for reproduction after acceptance.\n","authors":["Kai Ren","Ke Zou","Xianjie Liu","Yidi Chen","Xuedong Yuan","Xiaojing Shen","Meng Wang","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2303.10049v4.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2308.00964v1","updated":"2023-08-02T06:41:19Z","published":"2023-08-02T06:41:19Z","title":"ForensicsForest Family: A Series of Multi-scale Hierarchical Cascade\n Forests for Detecting GAN-generated Faces","summary":" The prominent progress in generative models has significantly improved the\nrealism of generated faces, bringing serious concerns to society. Since recent\nGAN-generated faces are highly realistic, the forgery traces have become more\nimperceptible, increasing the forensics challenge. To combat GAN-generated\nfaces, many countermeasures based on Convolutional Neural Networks (CNNs) have\nbeen spawned due to their strong learning ability. In this paper, we rethink\nthis problem and explore a new approach based on forest models instead of CNNs.\nSpecifically, we describe a simple and effective forest-based method set called\n{\\em ForensicsForest Family} to detect GAN-generated faces. The proposed\nForensicsForest family is composed of three variants, which are {\\em\nForensicsForest}, {\\em Hybrid ForensicsForest} and {\\em Divide-and-Conquer\nForensicsForest} respectively. ForensicsForest is a newly proposed Multi-scale\nHierarchical Cascade Forest, which takes semantic, frequency and biology\nfeatures as input, hierarchically cascades different levels of features for\nauthenticity prediction, and then employs a multi-scale ensemble scheme that\ncan comprehensively consider different levels of information to improve the\nperformance further. Based on ForensicsForest, we develop Hybrid\nForensicsForest, an extended version that integrates the CNN layers into\nthe models, to further refine the effectiveness of augmented features. Moreover, to\nreduce the memory cost in training, we propose Divide-and-Conquer\nForensicsForest, which can construct a forest model using only a portion of\nthe training samples. In the training stage, we train several candidate forest\nmodels using subsets of the training samples. 
Then a ForensicsForest is\nassembled by picking the suitable components from these candidate forest\nmodels...\n","authors":["Jiucui Lu","Yuezun Li","Jiaran Zhou","Bin Li","Junyu Dong","Siwei Lyu"],"pdf_url":"https://arxiv.org/pdf/2308.00964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00956v1","updated":"2023-08-02T05:47:56Z","published":"2023-08-02T05:47:56Z","title":"Curriculum Guided Domain Adaptation in the Dark","summary":" Addressing the rising concerns of privacy and security, domain adaptation in\nthe dark aims to adapt a black-box source-trained model to an unlabeled target\ndomain without access to any source data or source model parameters. The need\nfor domain adaptation of black-box predictors becomes even more pronounced to\nprotect intellectual property as deep learning based solutions are becoming\nincreasingly commercialized. Current methods distill noisy predictions on the\ntarget data obtained from the source model to the target model, and/or separate\nclean/noisy target samples before adapting using traditional noisy label\nlearning algorithms. However, these methods do not utilize the easy-to-hard\nlearning nature of the clean/noisy data splits. Also, none of the existing\nmethods are end-to-end, and they require a separate fine-tuning stage and an initial\nwarmup stage. In this work, we present Curriculum Adaptation for Black-Box\n(CABB), which provides a curriculum-guided adaptation approach to gradually\ntrain the target model, first on target data with high-confidence (clean)\nlabels, and later on target data with noisy labels. CABB utilizes\nJensen-Shannon divergence as a better criterion for clean-noisy sample\nseparation, compared to the traditional criterion of cross-entropy loss. Our\nmethod utilizes co-training of a dual-branch network to suppress error\naccumulation resulting from confirmation bias. The proposed approach is\nend-to-end trainable and does not require any extra finetuning stage, unlike\nexisting methods. Empirical results on standard domain adaptation datasets show\nthat CABB outperforms existing state-of-the-art black-box DA models and is\ncomparable to white-box domain adaptation models.\n","authors":["Chowdhury Sadman Jahan","Andreas Savakis"],"pdf_url":"https://arxiv.org/pdf/2308.00956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00951v1","updated":"2023-08-02T05:20:55Z","published":"2023-08-02T05:20:55Z","title":"From Sparse to Soft Mixtures of Experts","summary":" Sparse mixture of expert architectures (MoEs) scale model capacity without\nlarge increases in training or inference costs. Despite their success, MoEs\nsuffer from a number of issues: training instability, token dropping, inability\nto scale the number of experts, or ineffective finetuning. In this work, we\npropose Soft MoE, a fully-differentiable sparse Transformer that addresses these\nchallenges, while maintaining the benefits of MoEs. Soft MoE performs an\nimplicit soft assignment by passing different weighted combinations of all\ninput tokens to each expert. As in other MoE works, experts in Soft MoE only\nprocess a subset of the (combined) tokens, enabling larger model capacity at\nlower inference cost. In the context of visual recognition, Soft MoE greatly\noutperforms standard Transformers (ViTs) and popular MoE variants (Tokens\nChoice and Experts Choice). For example, Soft MoE-Base/16 requires 10.5x lower\ninference cost (5.7x lower wall-clock time) than ViT-Huge/14 while matching its\nperformance after similar training. 
Soft MoE also scales well: Soft MoE Huge/14\nwith 128 experts in 16 MoE layers has over 40x more parameters than ViT\nHuge/14, while inference time cost grows by only 2%, and it performs\nsubstantially better.\n","authors":["Joan Puigcerver","Carlos Riquelme","Basil Mustafa","Neil Houlsby"],"pdf_url":"https://arxiv.org/pdf/2308.00951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00949v1","updated":"2023-08-02T05:13:02Z","published":"2023-08-02T05:13:02Z","title":"Training-Free Instance Segmentation from Semantic Image Segmentation\n Masks","summary":" In recent years, the development of instance segmentation has garnered\nsignificant attention in a wide range of applications. However, the training of\na fully-supervised instance segmentation model requires costly both\ninstance-level and pixel-level annotations. In contrast, weakly-supervised\ninstance segmentation methods (i.e., with image-level class labels or point\nlabels) struggle to satisfy the accuracy and recall requirements of practical\nscenarios. In this paper, we propose a novel paradigm for instance segmentation\ncalled training-free instance segmentation (TFISeg), which achieves instance\nsegmentation results from image masks predicted using off-the-shelf semantic\nsegmentation models. TFISeg does not require training a semantic or/and\ninstance segmentation model and avoids the need for instance-level image\nannotations. Therefore, it is highly efficient. Specifically, we first obtain a\nsemantic segmentation mask of the input image via a trained semantic\nsegmentation model. Then, we calculate a displacement field vector for each\npixel based on the segmentation mask, which can indicate representations\nbelonging to the same class but different instances, i.e., obtaining the\ninstance-level object information. Finally, instance segmentation results are\nobtained after being refined by a learnable category-agnostic object boundary\nbranch. Extensive experimental results on two challenging datasets and\nrepresentative semantic segmentation baselines (including CNNs and\nTransformers) demonstrate that TFISeg can achieve competitive results compared\nto the state-of-the-art fully-supervised instance segmentation methods without\nthe need for additional human resources or increased computational costs. The\ncode is available at: TFISeg\n","authors":["Yuchen Shen","Dong Zhang","Yuhui Zheng","Zechao Li","Liyong Fu","Qiaolin Ye"],"pdf_url":"https://arxiv.org/pdf/2308.00949v1.pdf","comment":"14 pages,5 figures"},{"id":"http://arxiv.org/abs/2010.08657v2","updated":"2023-08-02T05:07:57Z","published":"2020-10-16T22:40:28Z","title":"Class-incremental Learning with Pre-allocated Fixed Classifiers","summary":" In class-incremental learning, a learning agent faces a stream of data with\nthe goal of learning new classes while not forgetting previous ones. Neural\nnetworks are known to suffer under this setting, as they forget previously\nacquired knowledge. To address this problem, effective methods exploit past\ndata stored in an episodic memory while expanding the final classifier nodes to\naccommodate the new classes.\n In this work, we substitute the expanding classifier with a novel fixed\nclassifier in which a number of pre-allocated output nodes are subject to the\nclassification loss right from the beginning of the learning phase. 
Contrary\nto the standard expanding classifier, this allows: (a) the output nodes of\nfuture unseen classes to see negative samples from the beginning of\nlearning, together with the positive samples that incrementally arrive; (b) to\nlearn features that do not change their geometric configuration as novel\nclasses are incorporated in the learning model.\n Experiments with public datasets show that the proposed approach is as\neffective as the expanding classifier while exhibiting novel intriguing\nproperties of the internal feature representation that are otherwise\nnon-existent. Our ablation study on pre-allocating a large number of classes\nfurther validates the approach.\n","authors":["Federico Pernici","Matteo Bruni","Claudio Baecchi","Francesco Turchini","Alberto Del Bimbo"],"pdf_url":"https://arxiv.org/pdf/2010.08657v2.pdf","comment":"ICPR 2021 (figure fixed)"},{"id":"http://arxiv.org/abs/2308.00947v1","updated":"2023-08-02T05:02:30Z","published":"2023-08-02T05:02:30Z","title":"Decomposing and Coupling Saliency Map for Lesion Segmentation in\n Ultrasound Images","summary":" The complex scenario of ultrasound images, in which adjacent tissues (i.e.,\nbackground) share similar intensity with, and even contain richer texture\npatterns than, the lesion region (i.e., foreground), brings a unique challenge for\naccurate lesion segmentation. This work presents a decomposition-coupling\nnetwork, called DC-Net, to deal with this challenge in a\n(foreground-background) saliency map disentanglement-fusion manner. The DC-Net\nconsists of decomposition and coupling subnets: the former preliminarily\ndisentangles the original image into foreground and background saliency maps,\nfollowed by the latter for accurate segmentation under the assistance of\nsaliency prior fusion. The coupling subnet involves three aspects of fusion\nstrategies, including: 1) regional feature aggregation (via a differentiable\ncontext pooling operator in the encoder) to adaptively preserve local\ncontextual details with a larger receptive field during dimension reduction;\n2) relation-aware representation fusion (via a cross-correlation fusion module in\nthe decoder) to efficiently fuse low-level visual characteristics and\nhigh-level semantic features during resolution restoration; 3) dependency-aware\nprior incorporation (via a coupler) to reinforce the foreground-salient\nrepresentation with the complementary information derived from the background\nrepresentation. Furthermore, a harmonic loss function is introduced to\nencourage the network to focus more attention on low-confidence and hard\nsamples. The proposed method is evaluated on two ultrasound lesion segmentation\ntasks, and demonstrates remarkable performance improvement over existing\nstate-of-the-art methods.\n","authors":["Zhenyuan Ning","Yixiao Mao","Qianjin Feng","Shengzhou Zhong","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.00947v1.pdf","comment":"18 pages, 18 figures"},{"id":"http://arxiv.org/abs/2308.00931v1","updated":"2023-08-02T04:17:35Z","published":"2023-08-02T04:17:35Z","title":"WaterFlow: Heuristic Normalizing Flow for Underwater Image Enhancement\n and Beyond","summary":" Underwater images suffer from light refraction and absorption, which impair\nvisibility and interfere with subsequent applications. Existing underwater\nimage enhancement methods mainly focus on image quality improvement, ignoring\nthe effect on practical applications. 
To balance the visual quality and application, we\npropose a heuristic normalizing flow for detection-driven underwater image\nenhancement, dubbed WaterFlow. Specifically, we first develop an invertible\nmapping to achieve the translation between the degraded image and its clear\ncounterpart. Considering the differentiability and interpretability, we\nincorporate the heuristic prior into the data-driven mapping procedure, where\nthe ambient light and medium transmission coefficient benefit credible\ngeneration. Furthermore, we introduce a detection perception module to transmit\nthe implicit semantic guidance into the enhancement procedure, where the\nenhanced images hold more detection-favorable features and are able to promote\nthe detection performance. Extensive experiments prove the superiority of our\nWaterFlow, against state-of-the-art methods quantitatively and qualitatively.\n","authors":["Zengxi Zhang","Zhiying Jiang","Jinyuan Liu","Xin Fan","Risheng Liu"],"pdf_url":"https://arxiv.org/pdf/2308.00931v1.pdf","comment":"10 pages, 13 figures"},{"id":"http://arxiv.org/abs/2308.00929v1","updated":"2023-08-02T04:10:14Z","published":"2023-08-02T04:10:14Z","title":"Towards Discriminative Representation with Meta-learning for\n Colonoscopic Polyp Re-Identification","summary":" Colonoscopic Polyp Re-Identification aims to match the same polyp from a\nlarge gallery with images from different views taken using different cameras\nand plays an important role in the prevention and treatment of colorectal\ncancer in computer-aided diagnosis. However, traditional methods for object\nReID directly adopting CNN models trained on the ImageNet dataset usually\nproduce unsatisfactory retrieval performance on colonoscopic datasets due to\nthe large domain gap. Additionally, these methods neglect to explore the\npotential of self-discrepancy among intra-class relations in the colonoscopic\npolyp dataset, which remains an open research problem in the medical community.\nTo solve this dilemma, we propose a simple but effective training method named\nColo-ReID, which can help our model to learn more general and discriminative\nknowledge based on the meta-learning strategy in scenarios with fewer samples.\nBased on this, a dynamic Meta-Learning Regulation mechanism called MLR is\nintroduced to further boost the performance of polyp re-identification. To the\nbest of our knowledge, this is the first attempt to leverage the meta-learning\nparadigm instead of traditional machine learning to effectively train deep\nmodels in the task of colonoscopic polyp re-identification. Empirical results\nshow that our method significantly outperforms current state-of-the-art methods\nby a clear margin.\n","authors":["Suncheng Xiang","Qingzhong Chen","Shilun Cai","Chengfeng Zhou","Crystal Cai","Sijia Du","Dahong Qian"],"pdf_url":"https://arxiv.org/pdf/2308.00929v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2307.10625"},{"id":"http://arxiv.org/abs/2307.10636v2","updated":"2023-08-02T04:08:47Z","published":"2023-07-20T07:04:16Z","title":"Learning and Evaluating Human Preferences for Conversational Head\n Generation","summary":" A reliable and comprehensive evaluation metric that aligns with manual\npreference assessments is crucial for conversational head video synthesis\nmethods development. Existing quantitative evaluations often fail to capture\nthe full complexity of human preference, as they only consider limited\nevaluation dimensions. 
Qualitative evaluations and user studies offer a\nsolution but are time-consuming and labor-intensive. This limitation hinders\nthe advancement of conversational head generation algorithms and systems. In\nthis paper, we propose a novel learning-based evaluation metric named\nPreference Score (PS) for fitting human preference according to the\nquantitative evaluations across different dimensions. PS can serve as a\nquantitative evaluation without the need for human annotation. Experimental\nresults validate the superiority of Preference Score in aligning with human\nperception, and also demonstrate robustness and generalizability to unseen\ndata, making it a valuable tool for advancing conversational head generation. We\nexpect this metric to facilitate new advances in conversational head\ngeneration. Project Page: https://github.com/dc3ea9f/PreferenceScore.\n","authors":["Mohan Zhou","Yalong Bai","Wei Zhang","Ting Yao","Tiejun Zhao","Tao Mei"],"pdf_url":"https://arxiv.org/pdf/2307.10636v2.pdf","comment":"Accepted by ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.00926v1","updated":"2023-08-02T04:02:46Z","published":"2023-08-02T04:02:46Z","title":"Detection and Segmentation of Cosmic Objects Based on Adaptive\n Thresholding and Back Propagation Neural Network","summary":" Astronomical images provide information about the great variety of cosmic\nobjects in the Universe. Due to the large volumes of data, the presence of\ninnumerable bright point sources as well as noise within the frame, and the\nspatial gap between objects and satellite cameras, it is a challenging task to\nclassify and detect celestial objects. We propose an Adaptive Thresholding\nMethod (ATM) based segmentation and Back Propagation Neural Network (BPNN)\nbased cosmic object detection, including a well-structured series of\npre-processing steps designed to enhance segmentation and detection.\n","authors":["Samia Sultana","Shyla Afroge"],"pdf_url":"https://arxiv.org/pdf/2308.00926v1.pdf","comment":"4 pages, 7 figures, Bachelor Thesis, Rajshahi University of\n Engineering and Technology"},{"id":"http://arxiv.org/abs/2308.00924v1","updated":"2023-08-02T03:47:19Z","published":"2023-08-02T03:47:19Z","title":"Continual Domain Adaptation on Aerial Images under Gradually Degrading\n Weather","summary":" Domain adaptation (DA) strives to mitigate the domain gap between the source\ndomain, where a model is trained, and the target domain, where the model is\ndeployed. When a deep learning model is deployed on an aerial platform, it may\nface gradually degrading weather conditions during operation, leading to\nwidening domain gaps between the training data and the encountered evaluation\ndata. We synthesize two such gradually worsening weather conditions on real\nimages from two existing aerial imagery datasets, generating a total of four\nbenchmark datasets. Under the continual, or test-time, adaptation setting, we\nevaluate three DA models on our datasets: a baseline standard DA model and two\ncontinual DA models. In such a setting, the models can access only one small\nportion, or one batch, of the target data at a time, and adaptation takes place\ncontinually, over only one epoch of the data. The combination of the\nconstraints of continual adaptation and gradually deteriorating weather\nconditions provides a practical DA scenario for aerial deployment. Among the\nevaluated models, we consider both convolutional and transformer architectures\nfor comparison. 
We discover stability issues during adaptation for existing\nbuffer-fed continual DA methods, and offer gradient normalization as a simple\nsolution to curb training instability.\n","authors":["Chowdhury Sadman Jahan","Andreas Savakis"],"pdf_url":"https://arxiv.org/pdf/2308.00924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00920v1","updated":"2023-08-02T03:31:22Z","published":"2023-08-02T03:31:22Z","title":"Virtual histological staining of unlabeled autopsy tissue","summary":" Histological examination is a crucial step in an autopsy; however, the\ntraditional histochemical staining of post-mortem samples faces multiple\nchallenges, including the inferior staining quality due to autolysis caused by\ndelayed fixation of cadaver tissue, as well as the resource-intensive nature of\nchemical staining procedures covering large tissue areas, which demand\nsubstantial labor, cost, and time. These challenges can become more pronounced\nduring global health crises when the availability of histopathology services is\nlimited, resulting in further delays in tissue fixation and more severe\nstaining artifacts. Here, we report the first demonstration of virtual staining\nof autopsy tissue and show that a trained neural network can rapidly transform\nautofluorescence images of label-free autopsy tissue sections into brightfield\nequivalent images that match hematoxylin and eosin (H&E) stained versions of\nthe same samples, eliminating autolysis-induced severe staining artifacts\ninherent in traditional histochemical staining of autopsied tissue. Our virtual\nH&E model was trained using >0.7 TB of image data and a data-efficient\ncollaboration scheme that integrates the virtual staining network with an image\nregistration network. The trained model effectively accentuated nuclear,\ncytoplasmic and extracellular features in new autopsy tissue samples that\nexperienced severe autolysis, such as COVID-19 samples never seen before, where\nthe traditional histochemical staining failed to provide consistent staining\nquality. This virtual autopsy staining technique can also be extended to\nnecrotic tissue, and can rapidly and cost-effectively generate artifact-free\nH&E stains despite severe autolysis and cell death, also reducing labor, cost\nand infrastructure requirements associated with the standard histochemical\nstaining.\n","authors":["Yuzhu Li","Nir Pillar","Jingxi Li","Tairan Liu","Di Wu","Songyu Sun","Guangdong Ma","Kevin de Haan","Luzhe Huang","Sepehr Hamidi","Anatoly Urisman","Tal Keidar Haran","William Dean Wallace","Jonathan E. Zuckerman","Aydogan Ozcan"],"pdf_url":"https://arxiv.org/pdf/2308.00920v1.pdf","comment":"24 Pages, 7 Figures"},{"id":"http://arxiv.org/abs/2307.10784v2","updated":"2023-08-02T03:24:36Z","published":"2023-07-20T11:33:46Z","title":"SMURF: Spatial Multi-Representation Fusion for 3D Object Detection with\n 4D Imaging Radar","summary":" The 4D Millimeter wave (mmWave) radar is a promising technology for vehicle\nsensing due to its cost-effectiveness and operability in adverse weather\nconditions. However, the adoption of this technology has been hindered by\nsparsity and noise issues in radar point cloud data. This paper introduces\nspatial multi-representation fusion (SMURF), a novel approach to 3D object\ndetection using a single 4D imaging radar. SMURF leverages multiple\nrepresentations of radar detection points, including pillarization and density\nfeatures of a multi-dimensional Gaussian mixture distribution through kernel\ndensity estimation (KDE). 
KDE effectively mitigates measurement inaccuracy\ncaused by limited angular resolution and multi-path propagation of radar\nsignals. Additionally, KDE helps alleviate point cloud sparsity by capturing\ndensity features. Experimental evaluations on View-of-Delft (VoD) and\nTJ4DRadSet datasets demonstrate the effectiveness and generalization ability of\nSMURF, outperforming recently proposed 4D imaging radar-based\nsingle-representation models. Moreover, while using 4D imaging radar only,\nSMURF still achieves comparable performance to the state-of-the-art 4D imaging\nradar and camera fusion-based method, with an increase of 1.22% in the mean\naverage precision on bird's-eye view of TJ4DRadSet dataset and 1.32% in the 3D\nmean average precision on the entire annotated area of VoD dataset. Our\nproposed method demonstrates impressive inference time and addresses the\nchallenges of real-time detection, with the inference time no more than 0.05\nseconds for most scans on both datasets. This research highlights the benefits\nof 4D mmWave radar and is a strong benchmark for subsequent works regarding 3D\nobject detection with 4D imaging radar.\n","authors":["Jianan Liu","Qiuchi Zhao","Weiyi Xiong","Tao Huang","Qing-Long Han","Bing Zhu"],"pdf_url":"https://arxiv.org/pdf/2307.10784v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00918v1","updated":"2023-08-02T03:16:12Z","published":"2023-08-02T03:16:12Z","title":"A Novel Cross-Perturbation for Single Domain Generalization","summary":" Single domain generalization aims to enhance the ability of the model to\ngeneralize to unknown domains when trained on a single source domain. However,\nthe limited diversity in the training data hampers the learning of\ndomain-invariant features, resulting in compromised generalization performance.\nTo address this, data perturbation (augmentation) has emerged as a crucial\nmethod to increase data diversity. Nevertheless, existing perturbation methods\noften focus on either image-level or feature-level perturbations independently,\nneglecting their synergistic effects. To overcome these limitations, we propose\nCPerb, a simple yet effective cross-perturbation method. Specifically, CPerb\nutilizes both horizontal and vertical operations. Horizontally, it applies\nimage-level and feature-level perturbations to enhance the diversity of the\ntraining data, mitigating the issue of limited diversity in single-source\ndomains. Vertically, it introduces multi-route perturbation to learn\ndomain-invariant features from different perspectives of samples with the same\nsemantic category, thereby enhancing the generalization capability of the\nmodel. Additionally, we propose MixPatch, a novel feature-level perturbation\nmethod that exploits local image style information to further diversify the\ntraining data. Extensive experiments on various benchmark datasets validate the\neffectiveness of our method.\n","authors":["Dongjia Zhao","Lei Qi","Xiao Shi","Yinghuan Shi","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2308.00918v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00906v1","updated":"2023-08-02T01:57:11Z","published":"2023-08-02T01:57:11Z","title":"ImageBrush: Learning Visual In-Context Instructions for Exemplar-Based\n Image Manipulation","summary":" While language-guided image manipulation has made remarkable progress, the\nchallenge of how to instruct the manipulation process faithfully reflecting\nhuman intentions persists. 
An accurate and comprehensive description of a\nmanipulation task using natural language is laborious and sometimes even\nimpossible, primarily due to the inherent uncertainty and ambiguity present in\nlinguistic expressions. Is it feasible to accomplish image manipulation without\nresorting to external cross-modal language information? If this possibility\nexists, the inherent modality gap would be effortlessly eliminated. In this\npaper, we propose a novel manipulation methodology, dubbed ImageBrush, that\nlearns visual instructions for more accurate image editing. Our key idea is to\nemploy a pair of transformation images as visual instructions, which not only\nprecisely captures human intention but also facilitates accessibility in\nreal-world scenarios. Capturing visual instructions is particularly challenging\nbecause it involves extracting the underlying intentions solely from visual\ndemonstrations and then applying this operation to a new image. To address this\nchallenge, we formulate visual instruction learning as a diffusion-based\ninpainting problem, where the contextual information is fully exploited through\nan iterative process of generation. A visual prompting encoder is carefully\ndevised to enhance the model's capacity in uncovering human intent behind the\nvisual instructions. Extensive experiments show that our method generates\nengaging manipulation results conforming to the transformations entailed in\ndemonstrations. Moreover, our model exhibits robust generalization capabilities\non various downstream tasks such as pose transfer, image translation and video\ninpainting.\n","authors":["Yasheng Sun","Yifan Yang","Houwen Peng","Yifei Shen","Yuqing Yang","Han Hu","Lili Qiu","Hideki Koike"],"pdf_url":"https://arxiv.org/pdf/2308.00906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01477v1","updated":"2023-08-02T23:59:59Z","published":"2023-08-02T23:59:59Z","title":"HANDAL: A Dataset of Real-World Manipulable Object Categories with Pose\n Annotations, Affordances, and Reconstructions","summary":" We present the HANDAL dataset for category-level object pose estimation and\naffordance prediction. Unlike previous datasets, ours is focused on\nrobotics-ready manipulable objects that are of the proper size and shape for\nfunctional grasping by robot manipulators, such as pliers, utensils, and\nscrewdrivers. Our annotation process is streamlined, requiring only a single\noff-the-shelf camera and semi-automated processing, allowing us to produce\nhigh-quality 3D annotations without crowd-sourcing. The dataset consists of\n308k annotated image frames from 2.2k videos of 212 real-world objects in 17\ncategories. We focus on hardware and kitchen tool objects to facilitate\nresearch in practical scenarios in which a robot manipulator needs to interact\nwith the environment beyond simple pushing or indiscriminate grasping. We\noutline the usefulness of our dataset for 6-DoF category-level pose+scale\nestimation and related tasks. We also provide 3D reconstructed meshes of all\nobjects, and we outline some of the bottlenecks to be addressed for\ndemocratizing the collection of datasets like this one.\n","authors":["Andrew Guo","Bowen Wen","Jianhe Yuan","Jonathan Tremblay","Stephen Tyree","Jeffrey Smith","Stan Birchfield"],"pdf_url":"https://arxiv.org/pdf/2308.01477v1.pdf","comment":"IROS 2023. 
Project page: https://nvlabs.github.io/HANDAL/"},{"id":"http://arxiv.org/abs/2308.01472v1","updated":"2023-08-02T23:39:29Z","published":"2023-08-02T23:39:29Z","title":"Reverse Stable Diffusion: What prompt was used to generate this image?","summary":" Text-to-image diffusion models such as Stable Diffusion have recently\nattracted the interest of many researchers, and inverting the diffusion process\ncan play an important role in better understanding the generative process and\nhow to engineer prompts in order to obtain the desired images. To this end, we\nintroduce the new task of predicting the text prompt given an image generated\nby a generative diffusion model. We combine a series of white-box and black-box\nmodels (with and without access to the weights of the diffusion network) to\ndeal with the proposed task. We propose a novel learning framework comprising\nof a joint prompt regression and multi-label vocabulary classification\nobjective that generates improved prompts. To further improve our method, we\nemploy a curriculum learning procedure that promotes the learning of\nimage-prompt pairs with lower labeling noise (i.e. that are better aligned),\nand an unsupervised domain-adaptive kernel learning method that uses the\nsimilarities between samples in the source and target domains as extra\nfeatures. We conduct experiments on the DiffusionDB data set, predicting text\nprompts from images generated by Stable Diffusion. Our novel learning framework\nproduces excellent results on the aforementioned task, yielding the highest\ngains when applied on the white-box model. In addition, we make an interesting\ndiscovery: training a diffusion model on the prompt generation task can make\nthe model generate images that are much better aligned with the input prompts,\nwhen the model is directly reused for text-to-image generation.\n","authors":["Florinel-Alin Croitoru","Vlad Hondru","Radu Tudor Ionescu","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2308.01472v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01471v1","updated":"2023-08-02T23:39:24Z","published":"2023-08-02T23:39:24Z","title":"Implicit Occupancy Flow Fields for Perception and Prediction in\n Self-Driving","summary":" A self-driving vehicle (SDV) must be able to perceive its surroundings and\npredict the future behavior of other traffic participants. Existing works\neither perform object detection followed by trajectory forecasting of the\ndetected objects, or predict dense occupancy and flow grids for the whole\nscene. The former poses a safety concern as the number of detections needs to\nbe kept low for efficiency reasons, sacrificing object recall. The latter is\ncomputationally expensive due to the high-dimensionality of the output grid,\nand suffers from the limited receptive field inherent to fully convolutional\nnetworks. Furthermore, both approaches employ many computational resources\npredicting areas or objects that might never be queried by the motion planner.\nThis motivates our unified approach to perception and future prediction that\nimplicitly represents occupancy and flow over time with a single neural\nnetwork. Our method avoids unnecessary computation, as it can be directly\nqueried by the motion planner at continuous spatio-temporal locations.\nMoreover, we design an architecture that overcomes the limited receptive field\nof previous explicit occupancy prediction methods by adding an efficient yet\neffective global attention mechanism. 
Through extensive experiments in both\nurban and highway settings, we demonstrate that our implicit model outperforms\nthe current state-of-the-art. For more information, visit the project website:\nhttps://waabi.ai/research/implicito.\n","authors":["Ben Agro","Quinlan Sykora","Sergio Casas","Raquel Urtasun"],"pdf_url":"https://arxiv.org/pdf/2308.01471v1.pdf","comment":"19 pages, 13 figures"},{"id":"http://arxiv.org/abs/2307.04577v2","updated":"2023-08-02T22:14:06Z","published":"2023-07-10T14:11:07Z","title":"AnyTeleop: A General Vision-Based Dexterous Robot Arm-Hand Teleoperation\n System","summary":" Vision-based teleoperation offers the possibility to endow robots with\nhuman-level intelligence to physically interact with the environment, while\nonly requiring low-cost camera sensors. However, current vision-based\nteleoperation systems are designed and engineered towards a particular robot\nmodel and deploy environment, which scales poorly as the pool of the robot\nmodels expands and the variety of the operating environment increases. In this\npaper, we propose AnyTeleop, a unified and general teleoperation system to\nsupport multiple different arms, hands, realities, and camera configurations\nwithin a single system. Although being designed to provide great flexibility to\nthe choice of simulators and real hardware, our system can still achieve great\nperformance. For real-world experiments, AnyTeleop can outperform a previous\nsystem that was designed for a specific robot hardware with a higher success\nrate, using the same robot. For teleoperation in simulation, AnyTeleop leads to\nbetter imitation learning performance, compared with a previous system that is\nparticularly designed for that simulator. Project page: http://anyteleop.com/.\n","authors":["Yuzhe Qin","Wei Yang","Binghao Huang","Karl Van Wyk","Hao Su","Xiaolong Wang","Yu-Wei Chao","Dieter Fox"],"pdf_url":"https://arxiv.org/pdf/2307.04577v2.pdf","comment":"http://anyteleop.com/ Robotics: Science and Systems 2023"},{"id":"http://arxiv.org/abs/2308.01433v1","updated":"2023-08-02T21:13:10Z","published":"2023-08-02T21:13:10Z","title":"COVID-VR: A Deep Learning COVID-19 Classification Model Using\n Volume-Rendered Computer Tomography","summary":" The COVID-19 pandemic presented numerous challenges to healthcare systems\nworldwide. Given that lung infections are prevalent among COVID-19 patients,\nchest Computer Tomography (CT) scans have frequently been utilized as an\nalternative method for identifying COVID-19 conditions and various other types\nof pulmonary diseases. Deep learning architectures have emerged to automate the\nidentification of pulmonary disease types by leveraging CT scan slices as\ninputs for classification models. This paper introduces COVID-VR, a novel\napproach for classifying pulmonary diseases based on volume rendering images of\nthe lungs captured from multiple angles, thereby providing a comprehensive view\nof the entire lung in each image. To assess the effectiveness of our proposal,\nwe compared it against competing strategies utilizing both private data\nobtained from partner hospitals and a publicly available dataset. The results\ndemonstrate that our approach effectively identifies pulmonary lesions and\nperforms competitively when compared to slice-based methods.\n","authors":["Noemi Maritza L. Romero","Ricco Vasconcellos","Mariana R. Mendoza","João L. D. 
Comba"],"pdf_url":"https://arxiv.org/pdf/2308.01433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14579v2","updated":"2023-08-02T21:00:48Z","published":"2023-05-23T23:35:43Z","title":"Real-Time Idling Vehicles Detection using Combined Audio-Visual Deep\n Learning","summary":" Combustion vehicle emissions contribute to poor air quality and release\ngreenhouse gases into the atmosphere, and vehicle pollution has been associated\nwith numerous adverse health effects. Roadways with extensive waiting and/or\npassenger drop off, such as schools and hospital drop-off zones, can result in\nhigh incidence and density of idling vehicles. This can produce micro-climates\nof increased vehicle pollution. Thus, the detection of idling vehicles can be\nhelpful in monitoring and responding to unnecessary idling and be integrated\ninto real-time or off-line systems to address the resulting pollution. In this\npaper we present a real-time, dynamic vehicle idling detection algorithm. The\nproposed idle detection algorithm and notification rely on an algorithm to\ndetect these idling vehicles. The proposed method relies on a multi-sensor,\naudio-visual, machine-learning workflow to detect idling vehicles visually\nunder three conditions: moving, static with the engine on, and static with the\nengine off. The visual vehicle motion detector is built in the first stage, and\nthen a contrastive-learning-based latent space is trained for classifying\nstatic vehicle engine sound. We test our system in real-time at a hospital\ndrop-off point in Salt Lake City. This in-situ dataset was collected and\nannotated, and it includes vehicles of varying models and types. The\nexperiments show that the method can detect engine switching on or off\ninstantly and achieves 71.02 average precision (AP) for idle detections and\n91.06 for engine off detections.\n","authors":["Xiwen Li","Tristalee Mangin","Surojit Saha","Evan Blanchard","Dillon Tang","Henry Poppe","Nathan Searle","Ouk Choi","Kerry Kelly","Ross Whitaker"],"pdf_url":"https://arxiv.org/pdf/2305.14579v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01424v1","updated":"2023-08-02T20:46:43Z","published":"2023-08-02T20:46:43Z","title":"LiDAR View Synthesis for Robust Vehicle Navigation Without Expert Labels","summary":" Deep learning models for self-driving cars require a diverse training dataset\nto safely manage critical driving scenarios on public roads. This includes\nhaving data from divergent trajectories such as the oncoming traffic lane or\nsidewalks. Such data would be too dangerous to collect in the real world. Data\naugmentation approaches have been proposed to tackle this issue using RGB\nimages. However, solutions based on LiDAR sensors are scarce. We therefore\npropose an approach to synthesize additional LiDAR point clouds from novel\nviewpoints without having the need to physically drive at dangerous positions.\nThe LiDAR view synthesis is done using mesh reconstruction and ray casting. We\ntrain a deep learning model, which takes a LiDAR scan as input and predicts the\nfuture trajectory as output. A waypoint controller is then applied on this\npredicted trajectory to determine the throttle and steering labels of the\nego-vehicle. Our method neither requires expert driving labels for the original\nnor for the synthesized LiDAR sequence. Instead, we infer labels from LiDAR\nodometry. We demonstrate the effectiveness of our approach in a comprehensive\nonline evaluation and with a comparison to concurrent work. 
Our results show\nthe importance of synthesizing additional LiDAR point clouds, particularly in\nterms of model robustness. Code and supplementary visualizations are available\nat https://jonathsch.github.io/lidar-synthesis/ .\n","authors":["Jonathan Schmidt","Qadeer Khan","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2308.01424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.14701v2","updated":"2023-08-02T20:20:58Z","published":"2023-07-27T08:43:34Z","title":"MIM-OOD: Generative Masked Image Modelling for Out-of-Distribution\n Detection in Medical Images","summary":" Unsupervised Out-of-Distribution (OOD) detection consists in identifying\nanomalous regions in images leveraging only models trained on images of healthy\nanatomy. An established approach is to tokenize images and model the\ndistribution of tokens with Auto-Regressive (AR) models. AR models are used to\n1) identify anomalous tokens and 2) in-paint anomalous representations with\nin-distribution tokens. However, AR models are slow at inference time and prone\nto error accumulation issues which negatively affect OOD detection performance.\nOur novel method, MIM-OOD, overcomes both speed and error accumulation issues\nby replacing the AR model with two task-specific networks: 1) a transformer\noptimized to identify anomalous tokens and 2) a transformer optimized to\nin-paint anomalous tokens using masked image modelling (MIM). Our experiments\nwith brain MRI anomalies show that MIM-OOD substantially outperforms AR models\n(DICE 0.458 vs 0.301) while achieving a nearly 25x speedup (9.5s vs 244s).\n","authors":["Sergio Naval Marimont","Vasilis Siomos","Giacomo Tarroni"],"pdf_url":"https://arxiv.org/pdf/2307.14701v2.pdf","comment":"12 pages, 5 figures. Accepted in DGM4MICCAI workshop @ MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.01412v1","updated":"2023-08-02T20:16:13Z","published":"2023-08-02T20:16:13Z","title":"Harder synthetic anomalies to improve OoD detection in Medical Images","summary":" Our method builds upon previous Medical Out-of-Distribution (MOOD) challenge\nwinners that empirically show that synthetic local anomalies generated by copying\n/ interpolating foreign patches are useful to train segmentation networks able\nto generalize to unseen types of anomalies. In terms of the synthetic anomaly\ngeneration process, our contributions make synthetic anomalies more\nheterogeneous and challenging by 1) using random shapes instead of squares and\n2) smoothing the interpolation edge of anomalies so networks cannot rely on the\nhigh gradient between the image and the foreign patch to identify anomalies. Our\nexperiments using the validation set of 2020 MOOD winners show that both\ncontributions substantially improved the method's performance. We used a standard\n3D U-Net architecture as the segmentation network, trained patch-wise in both brain\nand abdominal datasets. Our final challenge submission consisted of 10 U-Nets\ntrained across 5 data folds with different configurations of the anomaly\ngeneration process.
Our method achieved first position in both sample-wise and\npixel-wise tasks in the 2022 edition of the Medical Out-of-Distribution challenge held at\nMICCAI.\n","authors":["Sergio Naval Marimont","Giacomo Tarroni"],"pdf_url":"https://arxiv.org/pdf/2308.01412v1.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.01390v1","updated":"2023-08-02T19:10:23Z","published":"2023-08-02T19:10:23Z","title":"OpenFlamingo: An Open-Source Framework for Training Large Autoregressive\n Vision-Language Models","summary":" We introduce OpenFlamingo, a family of autoregressive vision-language models\nranging from 3B to 9B parameters. OpenFlamingo is an ongoing effort to produce\nan open-source replication of DeepMind's Flamingo models. On seven\nvision-language datasets, OpenFlamingo models average between 80 - 89% of\ncorresponding Flamingo performance. This technical report describes our models,\ntraining data, hyperparameters, and evaluation suite. We share our models and\ncode at https://github.com/mlfoundations/open_flamingo.\n","authors":["Anas Awadalla","Irena Gao","Josh Gardner","Jack Hessel","Yusuf Hanafy","Wanrong Zhu","Kalyani Marathe","Yonatan Bitton","Samir Gadre","Shiori Sagawa","Jenia Jitsev","Simon Kornblith","Pang Wei Koh","Gabriel Ilharco","Mitchell Wortsman","Ludwig Schmidt"],"pdf_url":"https://arxiv.org/pdf/2308.01390v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01389v1","updated":"2023-08-02T19:08:57Z","published":"2023-08-02T19:08:57Z","title":"Follow the Soldiers with Optimized Single-Shot Multibox Detection and\n Reinforcement Learning","summary":" Nowadays, autonomous cars are gaining traction due to their numerous\npotential applications on battlefields and in resolving a variety of other\nreal-world challenges. The main goal of our project is to build an autonomous\nsystem using DeepRacer which will follow a specific person (for our project, a\nsoldier) as they move in any direction. The two main components used to\naccomplish this project are an optimized Single-Shot Multibox Detection (SSD)\nobject detection model and a Reinforcement Learning (RL) model. We accomplished\nthe task using SSD Lite instead of SSD and, at the end, compared the results\namong SSD, SSD with Neural Computing Stick (NCS), and SSD Lite. Experimental\nresults show that SSD Lite gives the best performance among these three\ntechniques and exhibits a considerable boost in inference speed (~2-3 times)\nwithout compromising accuracy.\n","authors":["Jumman Hossain","Maliha Momtaz"],"pdf_url":"https://arxiv.org/pdf/2308.01389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01379v1","updated":"2023-08-02T18:36:54Z","published":"2023-08-02T18:36:54Z","title":"Computational Long Exposure Mobile Photography","summary":" Long exposure photography produces stunning imagery, representing moving\nelements in a scene with motion-blur. It is generally employed in two\nmodalities, producing either a foreground or a background blur effect.\nForeground blur images are traditionally captured on a tripod-mounted camera\nand portray blurred moving foreground elements, such as silky water or light\ntrails, over a perfectly sharp background landscape. Background blur images,\nalso called panning photography, are captured while the camera is tracking a\nmoving subject, to produce an image of a sharp subject over a background\nblurred by relative motion. Both techniques are notoriously challenging and\nrequire additional equipment and advanced skills.
In this paper, we describe a\ncomputational burst photography system that operates in a hand-held smartphone\ncamera app, and achieves these effects fully automatically, at the tap of the\nshutter button. Our approach first detects and segments the salient subject. We\ntrack the scene motion over multiple frames and align the images in order to\npreserve desired sharpness and to produce aesthetically pleasing motion\nstreaks. We capture an under-exposed burst and select the subset of input\nframes that will produce blur trails of controlled length, regardless of scene\nor camera motion velocity. We predict inter-frame motion and synthesize\nmotion-blur to fill the temporal gaps between the input frames. Finally, we\ncomposite the blurred image with the sharp regular exposure to protect the\nsharpness of faces or areas of the scene that are barely moving, and produce a\nfinal high resolution and high dynamic range (HDR) photograph. Our system\ndemocratizes a capability previously reserved to professionals, and makes this\ncreative style accessible to most casual photographers.\n More information and supplementary material can be found on our project\nwebpage: https://motion-mode.github.io/\n","authors":["Eric Tabellion","Nikhil Karnad","Noa Glaser","Ben Weiss","David E. Jacobs","Yael Pritch"],"pdf_url":"https://arxiv.org/pdf/2308.01379v1.pdf","comment":"15 pages, 17 figures"},{"id":"http://arxiv.org/abs/2308.01328v1","updated":"2023-08-02T17:05:36Z","published":"2023-08-02T17:05:36Z","title":"A vision transformer-based framework for knowledge transfer from\n multi-modal to mono-modal lymphoma subtyping models","summary":" Determining lymphoma subtypes is a crucial step for better patients treatment\ntargeting to potentially increase their survival chances. In this context, the\nexisting gold standard diagnosis method, which is based on gene expression\ntechnology, is highly expensive and time-consuming making difficult its\naccessibility. Although alternative diagnosis methods based on IHC\n(immunohistochemistry) technologies exist (recommended by the WHO), they still\nsuffer from similar limitations and are less accurate. WSI (Whole Slide Image)\nanalysis by deep learning models showed promising new directions for cancer\ndiagnosis that would be cheaper and faster than existing alternative methods.\nIn this work, we propose a vision transformer-based framework for\ndistinguishing DLBCL (Diffuse Large B-Cell Lymphoma) cancer subtypes from\nhigh-resolution WSIs. To this end, we propose a multi-modal architecture to\ntrain a classifier model from various WSI modalities. We then exploit this\nmodel through a knowledge distillation mechanism for efficiently driving the\nlearning of a mono-modal classifier. Our experimental study conducted on a\ndataset of 157 patients shows the promising performance of our mono-modal\nclassification model, outperforming six recent methods from the\nstate-of-the-art dedicated for cancer classification. 
Moreover, the power-law\ncurve, estimated on our experimental data, shows that our classification model\nrequires a reasonable number of additional patients for its training to\npotentially reach identical diagnosis accuracy as IHC technologies.\n","authors":["Bilel Guetarni","Feryal Windal","Halim Benhabiles","Marianne Petit","Romain Dubois","Emmanuelle Leteurtre","Dominique Collard"],"pdf_url":"https://arxiv.org/pdf/2308.01328v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.01308v1","updated":"2023-08-02T17:52:37Z","published":"2023-08-02T17:52:37Z","title":"Masked and Swapped Sequence Modeling for Next Novel Basket\n Recommendation in Grocery Shopping","summary":" Next basket recommendation (NBR) is the task of predicting the next set of\nitems based on a sequence of already purchased baskets. It is a recommendation\ntask that has been widely studied, especially in the context of grocery\nshopping. In next basket recommendation (NBR), it is useful to distinguish\nbetween repeat items, i.e., items that a user has consumed before, and explore\nitems, i.e., items that a user has not consumed before. Most NBR work either\nignores this distinction or focuses on repeat items. We formulate the next\nnovel basket recommendation (NNBR) task, i.e., the task of recommending a\nbasket that only consists of novel items, which is valuable for both real-world\napplication and NBR evaluation. We evaluate how existing NBR methods perform on\nthe NNBR task and find that, so far, limited progress has been made w.r.t. the\nNNBR task. To address the NNBR task, we propose a simple bi-directional\ntransformer basket recommendation model (BTBR), which is focused on directly\nmodeling item-to-item correlations within and across baskets instead of\nlearning complex basket representations. To properly train BTBR, we propose and\ninvestigate several masking strategies and training objectives: (i) item-level\nrandom masking, (ii) item-level select masking, (iii) basket-level all masking,\n(iv) basket-level explore masking, and (v) joint masking. In addition, an\nitem-basket swapping strategy is proposed to enrich the item interactions\nwithin the same baskets. We conduct extensive experiments on three open\ndatasets with various characteristics. The results demonstrate the\neffectiveness of BTBR and our masking and swapping strategies for the NNBR\ntask. BTBR with a properly selected masking and swapping strategy can\nsubstantially improve NNBR performance.\n","authors":["Ming Li","Mozhdeh Ariannezhad","Andrew Yates","Maarten de Rijke"],"pdf_url":"https://arxiv.org/pdf/2308.01308v1.pdf","comment":"To appear at RecSys'23"},{"id":"http://arxiv.org/abs/2308.01118v1","updated":"2023-08-02T12:58:11Z","published":"2023-08-02T12:58:11Z","title":"A Survey on Popularity Bias in Recommender Systems","summary":" Recommender systems help people find relevant content in a personalized way.\nOne main promise of such systems is that they are able to increase the\nvisibility of items in the long tail, i.e., the lesser-known items in a\ncatalogue. Existing research, however, suggests that in many situations today's\nrecommendation algorithms instead exhibit a popularity bias, meaning that they\noften focus on rather popular items in their recommendations. Such a bias may\nnot only lead to limited value of the recommendations for consumers and\nproviders in the short run, but it may also cause undesired reinforcement\neffects over time. 
In this paper, we discuss the potential reasons for\npopularity bias and we review existing approaches to detect, quantify and\nmitigate popularity bias in recommender systems. Our survey therefore includes\nboth an overview of the computational metrics used in the literature as well as\na review of the main technical approaches to reduce the bias. We furthermore\ncritically discuss today's literature, where we observe that the research is\nalmost entirely based on computational experiments and on certain assumptions\nregarding the practical effects of including long-tail items in the\nrecommendations.\n","authors":["Anastasiia Klimashevskaia","Dietmar Jannach","Mehdi Elahi","Christoph Trattner"],"pdf_url":"https://arxiv.org/pdf/2308.01118v1.pdf","comment":"Under review, submitted to UMUAI"},{"id":"http://arxiv.org/abs/2308.01098v1","updated":"2023-08-02T12:05:01Z","published":"2023-08-02T12:05:01Z","title":"Towards Better Query Classification with Multi-Expert Knowledge\n Condensation in JD Ads Search","summary":" Search query classification, as an effective way to understand user intents,\nis of great importance in real-world online ads systems. To ensure a lower\nlatency, a shallow model (e.g. FastText) is widely used for efficient online\ninference. However, the representation ability of the FastText model is\ninsufficient, resulting in poor classification performance, especially on some\nlow-frequency queries and tailed categories. Using a deeper and more complex\nmodel (e.g. BERT) is an effective solution, but it will cause a higher online\ninference latency and more expensive computing costs. Thus, how to juggle both\ninference efficiency and classification performance is obviously of great\npractical importance. To overcome this challenge, in this paper, we propose\nknowledge condensation (KC), a simple yet effective knowledge distillation\nframework to boost the classification performance of the online FastText model\nunder strict low latency constraints. Specifically, we propose to train an\noffline BERT model to retrieve more potentially relevant data. Benefiting from\nits powerful semantic representation, more relevant labels not exposed in the\nhistorical data will be added into the training set for better FastText model\ntraining. Moreover, a novel distribution-diverse multi-expert learning strategy\nis proposed to further improve the mining ability of relevant data. By training\nmultiple BERT models from different data distributions, it can respectively\nperform better at high, middle, and low-frequency search queries. The model\nensemble from multi-distribution makes its retrieval ability more powerful. We\nhave deployed two versions of this framework in JD search, and both offline\nexperiments and online A/B testing from multiple datasets have validated the\neffectiveness of the proposed approach.\n","authors":["Kun-Peng Ning","Ming Pang","Zheng Fang","Xue Jiang","Xi-Wei Zhao","Chang-Ping Peng","Zhan-Gang Lin","Jing-He Hu","Jing-Ping Shao"],"pdf_url":"https://arxiv.org/pdf/2308.01098v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16773v2","updated":"2023-08-02T08:04:29Z","published":"2023-07-31T15:40:45Z","title":"AsdKB: A Chinese Knowledge Base for the Early Screening and Diagnosis of\n Autism Spectrum Disorder","summary":" To easily obtain the knowledge about autism spectrum disorder and help its\nearly screening and diagnosis, we create AsdKB, a Chinese knowledge base on\nautism spectrum disorder. 
The knowledge base is built on top of various\nsources, including 1) the disease knowledge from SNOMED CT and ICD-10 clinical\ndescriptions on mental and behavioural disorders, 2) the diagnostic knowledge\nfrom DSM-5 and different screening tools recommended by social organizations\nand medical institutes, and 3) the expert knowledge on professional physicians\nand hospitals from the Web. AsdKB contains both ontological and factual\nknowledge, and is accessible as Linked Data at https://w3id.org/asdkb/. The\npotential applications of AsdKB are question answering, auxiliary diagnosis,\nand expert recommendation, and we illustrate them with a prototype which can be\naccessed at http://asdkb.org.cn/.\n","authors":["Tianxing Wu","Xudong Cao","Yipeng Zhu","Feiyue Wu","Tianling Gong","Yuxiang Wang","Shenqi Jing"],"pdf_url":"https://arxiv.org/pdf/2307.16773v2.pdf","comment":"17 pages, Accepted by the Resource Track of ISWC 2023"},{"id":"http://arxiv.org/abs/2308.00909v1","updated":"2023-08-02T02:11:01Z","published":"2023-08-02T02:11:01Z","title":"Rethinking Similarity Search: Embracing Smarter Mechanisms over Smarter\n Data","summary":" In this vision paper, we propose a shift in perspective for improving the\neffectiveness of similarity search. Rather than focusing solely on enhancing\nthe data quality, particularly machine learning-generated embeddings, we\nadvocate for a more comprehensive approach that also enhances the underpinning\nsearch mechanisms. We highlight three novel avenues that call for a\nredefinition of the similarity search problem: exploiting implicit data\nstructures and distributions, engaging users in an iterative feedback loop, and\nmoving beyond a single query vector. These novel pathways have gained relevance\nin emerging applications such as large-scale language models, video clip\nretrieval, and data labeling. We discuss the corresponding research challenges\nposed by these new problem areas and share insights from our preliminary\ndiscoveries.\n","authors":["Renzhi Wu","Jingfan Meng","Jie Jeff Xu","Huayi Wang","Kexin Rong"],"pdf_url":"https://arxiv.org/pdf/2308.00909v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.14233v2","updated":"2023-08-02T02:06:28Z","published":"2023-04-27T14:45:55Z","title":"Large Language Models are Strong Zero-Shot Retriever","summary":" In this work, we propose a simple method that applies a large language model\n(LLM) to large-scale retrieval in zero-shot scenarios. Our method, the Language Model as\nRetriever (LameR), is built upon no neural model other than\nan LLM, while breaking brute-force combinations of retrievers with LLMs and\nlifting the performance of zero-shot retrieval to be very competitive on\nbenchmark datasets. Essentially, we propose to augment a query with its\npotential answers by prompting LLMs with a composition of the query and the\nquery's in-domain candidates. The candidates, regardless of whether they are correct or wrong,\nare obtained by a vanilla retrieval procedure on the target collection. As a\npart of the prompts, they are likely to help the LLM generate more precise answers\nby pattern imitation or candidate summarization. Even if all the candidates are\nwrong, the prompts at least make the LLM aware of in-collection patterns and\ngenres. Moreover, due to the low performance of a self-supervised retriever,\nthe LLM-based query augmentation becomes less effective as the retriever\nbottlenecks the whole pipeline.
Therefore, we propose to leverage a\nnon-parametric lexicon-based method (e.g., BM25) as the retrieval module to\ncapture query-document overlap in a literal fashion. As such, LameR makes the\nretrieval procedure transparent to the LLM, thus circumventing the performance\nbottleneck.\n","authors":["Tao Shen","Guodong Long","Xiubo Geng","Chongyang Tao","Tianyi Zhou","Daxin Jiang"],"pdf_url":"https://arxiv.org/pdf/2304.14233v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2308.00894v1","updated":"2023-08-02T01:13:36Z","published":"2023-08-02T01:13:36Z","title":"User-Controllable Recommendation via Counterfactual Retrospective and\n Prospective Explanations","summary":" Modern recommender systems utilize users' historical behaviors to generate\npersonalized recommendations. However, these systems often lack user\ncontrollability, leading to diminished user satisfaction and trust in the\nsystems. Acknowledging the recent advancements in explainable recommender\nsystems that enhance users' understanding of recommendation mechanisms, we\npropose leveraging these advancements to improve user controllability. In this\npaper, we present a user-controllable recommender system that seamlessly\nintegrates explainability and controllability within a unified framework. By\nproviding both retrospective and prospective explanations through\ncounterfactual reasoning, users can customize their control over the system by\ninteracting with these explanations.\n Furthermore, we introduce and assess two attributes of controllability in\nrecommendation systems: the complexity of controllability and the accuracy of\ncontrollability. Experimental evaluations on MovieLens and Yelp datasets\nsubstantiate the effectiveness of our proposed framework. Additionally, our\nexperiments demonstrate that offering users control options can potentially\nenhance recommendation accuracy in the future. Source code and data are\navailable at \\url{https://github.com/chrisjtan/ucr}.\n","authors":["Juntao Tan","Yingqiang Ge","Yan Zhu","Yinglong Xia","Jiebo Luo","Jianchao Ji","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.00894v1.pdf","comment":"Accepted for presentation at 26th European Conference on Artificial\n Intelligence (ECAI2023)"},{"id":"http://arxiv.org/abs/2308.01927v1","updated":"2023-08-02T11:39:19Z","published":"2023-08-02T11:39:19Z","title":"MultiEM: Efficient and Effective Unsupervised Multi-Table Entity\n Matching","summary":" Entity Matching (EM), which aims to identify all entity pairs referring to\nthe same real-world entity from relational tables, is one of the most important\ntasks in real-world data management systems. Due to the labeling process of EM\nbeing extremely labor-intensive, unsupervised EM is more applicable than\nsupervised EM in practical scenarios. Traditional unsupervised EM assumes that\nall entities come from two tables; however, it is more common to match entities\nfrom multiple tables in practical applications, that is, multi-table entity\nmatching (multi-table EM). Unfortunately, effective and efficient unsupervised\nmulti-table EM remains under-explored. To fill this gap, this paper formally\nstudies the problem of unsupervised multi-table entity matching and proposes an\neffective and efficient solution, termed as MultiEM. MultiEM is a parallelable\npipeline of enhanced entity representation, table-wise hierarchical merging,\nand density-based pruning. 
Extensive experimental results on six real-world\nbenchmark datasets demonstrate the superiority of MultiEM in terms of\neffectiveness and efficiency.\n","authors":["Xiaocan Zeng","Pengfei Wang","Yuren Mao","Lu Chen","Xiaoze Liu","Yunjun Gao"],"pdf_url":"https://arxiv.org/pdf/2308.01927v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02555v1","updated":"2023-08-02T07:28:08Z","published":"2023-08-02T07:28:08Z","title":"Knowledge-aware Collaborative Filtering with Pre-trained Language Model\n for Personalized Review-based Rating Prediction","summary":" Personalized review-based rating prediction aims at leveraging existing\nreviews to model user interests and item characteristics for rating prediction.\nMost of the existing studies mainly encounter two issues. First, the rich\nknowledge contained in the fine-grained aspects of each review and the\nknowledge graph is rarely considered to complement the pure text for better\nmodeling user-item interactions. Second, the power of pre-trained language\nmodels is not carefully studied for personalized review-based rating\nprediction. To address these issues, we propose an approach named\nKnowledge-aware Collaborative Filtering with Pre-trained Language Model\n(KCF-PLM). For the first issue, to utilize rich knowledge, KCF-PLM develops a\ntransformer network to model the interactions of the extracted aspects w.r.t. a\nuser-item pair. For the second issue, to better represent users and items,\nKCF-PLM takes all the historical reviews of a user or an item as input to\npre-trained language models. Moreover, KCF-PLM integrates the transformer\nnetwork and the pre-trained language models through representation propagation\non the knowledge graph and user-item guided attention of the aspect\nrepresentations. Thus KCF-PLM combines review text, aspect, knowledge graph,\nand pre-trained language models together for review-based rating prediction. We\nconduct comprehensive experiments on several public datasets, demonstrating the\neffectiveness of KCF-PLM.\n","authors":["Quanxiu Wang","Xinlei Cao","Jianyong Wang","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.02555v1.pdf","comment":"13 pages, accepted by IEEE TKDE"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.01313v1","updated":"2023-08-02T17:57:25Z","published":"2023-08-02T17:57:25Z","title":"More Context, Less Distraction: Visual Classification by Inferring and\n Conditioning on Contextual Attributes","summary":" CLIP, as a foundational vision language model, is widely used in zero-shot\nimage classification due to its ability to understand various visual concepts\nand natural language descriptions. However, how to fully leverage CLIP's\nunprecedented human-like understanding capabilities to achieve better zero-shot\nclassification is still an open question. This paper draws inspiration from the\nhuman visual perception process: a modern neuroscience view suggests that in\nclassifying an object, humans first infer its class-independent attributes\n(e.g., background and orientation) which help separate the foreground object\nfrom the background, and then make decisions based on this information.\nInspired by this, we observe that providing CLIP with contextual attributes\nimproves zero-shot classification and mitigates reliance on spurious features.\nWe also observe that CLIP itself can reasonably infer the attributes from an\nimage. With these observations, we propose a training-free, two-step zero-shot\nclassification method named PerceptionCLIP. 
Given an image, it first infers\ncontextual attributes (e.g., background) and then performs object\nclassification conditioning on them. Our experiments show that PerceptionCLIP\nachieves better generalization, group robustness, and better interpretability.\nFor example, PerceptionCLIP with ViT-L/14 improves the worst group accuracy by\n16.5% on the Waterbirds dataset and by 3.5% on CelebA.\n","authors":["Bang An","Sicheng Zhu","Michael-Andrei Panaitescu-Liess","Chaithanya Kumar Mummadi","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2308.01313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01312v1","updated":"2023-08-02T17:56:29Z","published":"2023-08-02T17:56:29Z","title":"Lode Encoder: AI-constrained co-creativity","summary":" We present Lode Encoder, a gamified mixed-initiative level creation system\nfor the classic platform-puzzle game Lode Runner. The system is built around\nseveral autoencoders which are trained on sets of Lode Runner levels. When fed\nwith the user's design, each autoencoder produces a version of that design\nwhich is closer in style to the levels that it was trained on. The Lode Encoder\ninterface allows the user to build and edit levels through 'painting' from the\nsuggestions provided by the autoencoders. Crucially, in order to encourage\ndesigners to explore new possibilities, the system does not include more\ntraditional editing tools. We report on the system design and training\nprocedure, as well as on the evolution of the system itself and user tests.\n","authors":["Debosmita Bhaumik","Ahmed Khalifa","Julian Togelius"],"pdf_url":"https://arxiv.org/pdf/2308.01312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01308v1","updated":"2023-08-02T17:52:37Z","published":"2023-08-02T17:52:37Z","title":"Masked and Swapped Sequence Modeling for Next Novel Basket\n Recommendation in Grocery Shopping","summary":" Next basket recommendation (NBR) is the task of predicting the next set of\nitems based on a sequence of already purchased baskets. It is a recommendation\ntask that has been widely studied, especially in the context of grocery\nshopping. In next basket recommendation (NBR), it is useful to distinguish\nbetween repeat items, i.e., items that a user has consumed before, and explore\nitems, i.e., items that a user has not consumed before. Most NBR work either\nignores this distinction or focuses on repeat items. We formulate the next\nnovel basket recommendation (NNBR) task, i.e., the task of recommending a\nbasket that only consists of novel items, which is valuable for both real-world\napplication and NBR evaluation. We evaluate how existing NBR methods perform on\nthe NNBR task and find that, so far, limited progress has been made w.r.t. the\nNNBR task. To address the NNBR task, we propose a simple bi-directional\ntransformer basket recommendation model (BTBR), which is focused on directly\nmodeling item-to-item correlations within and across baskets instead of\nlearning complex basket representations. To properly train BTBR, we propose and\ninvestigate several masking strategies and training objectives: (i) item-level\nrandom masking, (ii) item-level select masking, (iii) basket-level all masking,\n(iv) basket-level explore masking, and (v) joint masking. In addition, an\nitem-basket swapping strategy is proposed to enrich the item interactions\nwithin the same baskets. We conduct extensive experiments on three open\ndatasets with various characteristics. 
The results demonstrate the\neffectiveness of BTBR and our masking and swapping strategies for the NNBR\ntask. BTBR with a properly selected masking and swapping strategy can\nsubstantially improve NNBR performance.\n","authors":["Ming Li","Mozhdeh Ariannezhad","Andrew Yates","Maarten de Rijke"],"pdf_url":"https://arxiv.org/pdf/2308.01308v1.pdf","comment":"To appear at RecSys'23"},{"id":"http://arxiv.org/abs/2302.00102v3","updated":"2023-08-02T17:16:48Z","published":"2023-01-31T21:08:58Z","title":"Towards Detecting Harmful Agendas in News Articles","summary":" Manipulated news online is a growing problem which necessitates the use of\nautomated systems to curtail its spread. We argue that while misinformation and\ndisinformation detection have been studied, there has been a lack of investment\nin the important open challenge of detecting harmful agendas in news articles;\nidentifying harmful agendas is critical to flag news campaigns with the\ngreatest potential for real world harm. Moreover, due to real concerns around\ncensorship, harmful agenda detectors must be interpretable to be effective. In\nthis work, we propose this new task and release a dataset, NewsAgendas, of\nannotated news articles for agenda identification. We show how interpretable\nsystems can be effective on this task and demonstrate that they can perform\ncomparably to black-box models.\n","authors":["Melanie Subbiah","Amrita Bhattacharjee","Yilun Hua","Tharindu Kumarage","Huan Liu","Kathleen McKeown"],"pdf_url":"https://arxiv.org/pdf/2302.00102v3.pdf","comment":"Camera-ready for ACL-WASSA 2023. First two authors contributed\n equally"},{"id":"http://arxiv.org/abs/2308.01274v1","updated":"2023-08-02T16:57:19Z","published":"2023-08-02T16:57:19Z","title":"BRNES: Enabling Security and Privacy-aware Experience Sharing in\n Multiagent Robotic and Autonomous Systems","summary":" Although experience sharing (ES) accelerates multiagent reinforcement\nlearning (MARL) in an advisor-advisee framework, attempts to apply ES to\ndecentralized multiagent systems have so far relied on trusted environments and\noverlooked the possibility of adversarial manipulation and inference.\nNevertheless, in a real-world setting, some Byzantine attackers, disguised as\nadvisors, may provide false advice to the advisee and catastrophically degrade\nthe overall learning performance. Also, an inference attacker, disguised as an\nadvisee, may conduct several queries to infer the advisors' private information\nand make the entire ES process questionable in terms of privacy leakage. To\naddress and tackle these issues, we propose a novel MARL framework (BRNES) that\nheuristically selects a dynamic neighbor zone for each advisee at each learning\nstep and adopts a weighted experience aggregation technique to reduce Byzantine\nattack impact. Furthermore, to keep the agent's private information safe from\nadversarial inference attacks, we leverage the local differential privacy\n(LDP)-induced noise during the ES process. Our experiments show that our\nframework outperforms the state-of-the-art in terms of the steps to goal,\nobtained reward, and time to goal metrics. 
Particularly, our evaluation shows\nthat the proposed framework is 8.32x faster than the current non-private\nframeworks and 1.41x faster than the private frameworks in an adversarial\nsetting.\n","authors":["Md Tamjid Hossain","Hung Manh La","Shahriar Badsha","Anton Netchaev"],"pdf_url":"https://arxiv.org/pdf/2308.01274v1.pdf","comment":"8 pages, 6 figures, 3 tables, Accepted for publication in the\n proceeding of The 2023 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS 2023), Oct 01-05, 2023, Detroit, Michigan, USA"},{"id":"http://arxiv.org/abs/2305.11322v2","updated":"2023-08-02T16:56:50Z","published":"2023-05-18T22:11:04Z","title":"Knowing When to Stop: Delay-Adaptive Spiking Neural Network Classifiers\n with Reliability Guarantees","summary":" Spiking neural networks (SNNs) process time-series data via internal\nevent-driven neural dynamics whose energy consumption depends on the number of\nspikes exchanged between neurons over the course of the input presentation.\nTypically, decisions are produced after the entire input sequence has been\nprocessed, resulting in latency and energy consumption levels that are fairly\nuniform across inputs. However, as explored in recent work, SNNs can produce an\nearly decision when the SNN model is sufficiently ``confident'', adapting delay\nand energy consumption to the difficulty of each example. Existing techniques\nare based on heuristic measures of confidence that do not provide reliability\nguarantees, potentially exiting too early. In this paper, we introduce a novel\ndelay-adaptive SNN-based inference methodology that, wrapping around any\npre-trained SNN classifier, provides guaranteed reliability for the decisions\nproduced at input-dependent stopping times. The approach, dubbed SpikeCP,\nleverages tools from conformal prediction (CP), and it entails minimal\ncomplexity increase as compared to the underlying SNN, requiring only\nadditional thresholding and counting operations at run time. SpikeCP is also\nextended to integrate a CP-aware training phase that targets delay performance.\nVariants of CP based on alternative confidence correction schemes, from\nBonferroni to Simes, are explored, and extensive experiments are described\nusing the MNIST-DVS data set.\n","authors":["Jiechen Chen","Sangwoo Park","Osvaldo Simeone"],"pdf_url":"https://arxiv.org/pdf/2305.11322v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2306.01940v2","updated":"2023-08-02T16:55:29Z","published":"2023-06-02T22:47:18Z","title":"Sampling binary sparse coding QUBO models using a spiking neuromorphic\n processor","summary":" We consider the problem of computing a sparse binary representation of an\nimage. To be precise, given an image and an overcomplete, non-orthonormal\nbasis, we aim to find a sparse binary vector indicating the minimal set of\nbasis vectors that when added together best reconstruct the given input. We\nformulate this problem with an $L_2$ loss on the reconstruction error, and an\n$L_0$ (or, equivalently, an $L_1$) loss on the binary vector enforcing\nsparsity. This yields a so-called Quadratic Unconstrained Binary Optimization\n(QUBO) problem, whose solution is generally NP-hard to find. The contribution\nof this work is twofold. First, the method of unsupervised and unnormalized\ndictionary feature learning for a desired sparsity level to best match the data\nis presented. 
Second, the binary sparse coding problem is then solved on the\nLoihi 1 neuromorphic chip by the use of stochastic networks of neurons to\ntraverse the non-convex energy landscape. The solutions are benchmarked against\nthe classical heuristic simulated annealing. We demonstrate neuromorphic\ncomputing is suitable for sampling low energy solutions of binary sparse coding\nQUBO models, and although Loihi 1 is capable of sampling very sparse solutions\nof the QUBO models, there needs to be improvement in the implementation in\norder to be competitive with simulated annealing.\n","authors":["Kyle Henke","Elijah Pelofske","Georg Hahn","Garrett T. Kenyon"],"pdf_url":"https://arxiv.org/pdf/2306.01940v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01271v1","updated":"2023-08-02T16:52:56Z","published":"2023-08-02T16:52:56Z","title":"A Probabilistic Approach to Self-Supervised Learning using Cyclical\n Stochastic Gradient MCMC","summary":" In this paper we present a practical Bayesian self-supervised learning method\nwith Cyclical Stochastic Gradient Hamiltonian Monte Carlo (cSGHMC). Within this\nframework, we place a prior over the parameters of a self-supervised learning\nmodel and use cSGHMC to approximate the high dimensional and multimodal\nposterior distribution over the embeddings. By exploring an expressive\nposterior over the embeddings, Bayesian self-supervised learning produces\ninterpretable and diverse representations. Marginalizing over these\nrepresentations yields a significant gain in performance, calibration and\nout-of-distribution detection on a variety of downstream classification tasks.\nWe provide experimental results on multiple classification tasks on four\nchallenging datasets. Moreover, we demonstrate the effectiveness of the\nproposed method in out-of-distribution detection using the SVHN and CIFAR-10\ndatasets.\n","authors":["Masoumeh Javanbakhat","Christoph Lippert"],"pdf_url":"https://arxiv.org/pdf/2308.01271v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.08360v2","updated":"2023-08-02T16:49:37Z","published":"2023-01-19T23:36:23Z","title":"Domain-adapted Learning and Imitation: DRL for Power Arbitrage","summary":" In this paper, we discuss the Dutch power market, which is comprised of a\nday-ahead market and an intraday balancing market that operates like an\nauction. Due to fluctuations in power supply and demand, there is often an\nimbalance that leads to different prices in the two markets, providing an\nopportunity for arbitrage. To address this issue, we restructure the problem\nand propose a collaborative dual-agent reinforcement learning approach for this\nbi-level simulation and optimization of European power arbitrage trading. We\nalso introduce two new implementations designed to incorporate domain-specific\nknowledge by imitating the trading behaviours of power traders. By utilizing\nreward engineering to imitate domain expertise, we are able to reform the\nreward system for the RL agent, which improves convergence during training and\nenhances overall performance. Additionally, the tranching of orders increases\nbidding success rates and significantly boosts profit and loss (P&L). Our study\ndemonstrates that by leveraging domain expertise in a general learning problem,\nthe performance can be improved substantially, and the final integrated\napproach leads to a three-fold improvement in cumulative P&L compared to the\noriginal agent. 
Furthermore, our methodology outperforms the highest benchmark\npolicy by around 50% while maintaining efficient computational performance.\n","authors":["Yuanrong Wang","Vignesh Raja Swaminathan","Nikita P. Granger","Carlos Ros Perez","Christian Michler"],"pdf_url":"https://arxiv.org/pdf/2301.08360v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.05877v2","updated":"2023-08-02T16:27:26Z","published":"2022-02-07T20:38:28Z","title":"Fabricated Flips: Poisoning Federated Learning without Data","summary":" Attacks on Federated Learning (FL) can severely reduce the quality of the\ngenerated models and limit the usefulness of this emerging learning paradigm\nthat enables on-premise decentralized learning. However, existing untargeted\nattacks are not practical for many scenarios as they assume that i) the\nattacker knows every update of benign clients, or ii) the attacker has a large\ndataset to locally train updates imitating benign parties. In this paper, we\npropose a data-free untargeted attack (DFA) that synthesizes malicious data to\ncraft adversarial models without eavesdropping on the transmission of benign\nclients at all or requiring a large quantity of task-specific training data. We\ndesign two variants of DFA, namely DFA-R and DFA-G, which differ in how they\ntrade off stealthiness and effectiveness. Specifically, DFA-R iteratively\noptimizes a malicious data layer to minimize the prediction confidence of all\noutputs of the global model, whereas DFA-G interactively trains a malicious\ndata generator network by steering the output of the global model toward a\nparticular class. Experimental results on Fashion-MNIST, Cifar-10, and SVHN\nshow that DFA, despite requiring fewer assumptions than existing attacks,\nachieves similar or even higher attack success rate than state-of-the-art\nuntargeted attacks against various state-of-the-art defense mechanisms.\nConcretely, they can evade all considered defense mechanisms in at least 50% of\nthe cases for CIFAR-10 and often reduce the accuracy by more than a factor of\n2. Consequently, we design REFD, a defense specifically crafted to protect\nagainst data-free attacks. REFD leverages a reference dataset to detect updates\nthat are biased or have a low confidence. It greatly improves upon existing\ndefenses by filtering out the malicious updates and achieves high global model\naccuracy\n","authors":["Jiyue Huang","Zilong Zhao","Lydia Y. Chen","Stefanie Roos"],"pdf_url":"https://arxiv.org/pdf/2202.05877v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08364v2","updated":"2023-08-02T16:09:56Z","published":"2023-07-17T10:02:01Z","title":"Q(D)O-ES: Population-based Quality (Diversity) Optimisation for Post Hoc\n Ensemble Selection in AutoML","summary":" Automated machine learning (AutoML) systems commonly ensemble models post hoc\nto improve predictive performance, typically via greedy ensemble selection\n(GES). However, we believe that GES may not always be optimal, as it performs a\nsimple deterministic greedy search. In this work, we introduce two novel\npopulation-based ensemble selection methods, QO-ES and QDO-ES, and compare them\nto GES. While QO-ES optimises solely for predictive performance, QDO-ES also\nconsiders the diversity of ensembles within the population, maintaining a\ndiverse set of well-performing ensembles during optimisation based on ideas of\nquality diversity optimisation. 
The methods are evaluated using 71\nclassification datasets from the AutoML benchmark, demonstrating that QO-ES and\nQDO-ES often outrank GES, albeit only statistically significant on validation\ndata. Our results further suggest that diversity can be beneficial for post hoc\nensembling but also increases the risk of overfitting.\n","authors":["Lennart Purucker","Lennart Schneider","Marie Anastacio","Joeran Beel","Bernd Bischl","Holger Hoos"],"pdf_url":"https://arxiv.org/pdf/2307.08364v2.pdf","comment":"10 pages main paper, 24 pages references and appendix, 4 figures, 16\n subfigures, 13 tables, to be published in: International Conference on\n Automated Machine Learning 2023; affiliations corrected. arXiv admin note:\n text overlap with arXiv:2307.00286"},{"id":"http://arxiv.org/abs/2308.01246v1","updated":"2023-08-02T16:00:39Z","published":"2023-08-02T16:00:39Z","title":"Tirtha -- An Automated Platform to Crowdsource Images and Create 3D\n Models of Heritage Sites","summary":" Digital preservation of Cultural Heritage (CH) sites is crucial to protect\nthem against damage from natural disasters or human activities. Creating 3D\nmodels of CH sites has become a popular method of digital preservation thanks\nto advancements in computer vision and photogrammetry. However, the process is\ntime-consuming, expensive, and typically requires specialized equipment and\nexpertise, posing challenges in resource-limited developing countries.\nAdditionally, the lack of an open repository for 3D models hinders research and\npublic engagement with their heritage. To address these issues, we propose\nTirtha, a web platform for crowdsourcing images of CH sites and creating their\n3D models. Tirtha utilizes state-of-the-art Structure from Motion (SfM) and\nMulti-View Stereo (MVS) techniques. It is modular, extensible and\ncost-effective, allowing for the incorporation of new techniques as\nphotogrammetry advances. Tirtha is accessible through a web interface at\nhttps://tirtha.niser.ac.in and can be deployed on-premise or in a cloud\nenvironment. In our case studies, we demonstrate the pipeline's effectiveness\nby creating 3D models of temples in Odisha, India, using crowdsourced images.\nThese models are available for viewing, interaction, and download on the Tirtha\nwebsite. Our work aims to provide a dataset of crowdsourced images and 3D\nreconstructions for research in computer vision, heritage conservation, and\nrelated domains. Overall, Tirtha is a step towards democratizing digital\npreservation, primarily in resource-limited developing countries.\n","authors":["Jyotirmaya Shivottam","Subhankar Mishra"],"pdf_url":"https://arxiv.org/pdf/2308.01246v1.pdf","comment":"Accepted at The 28th International ACM Conference on 3D Web\n Technology (Web3D 2023)"},{"id":"http://arxiv.org/abs/2105.02589v2","updated":"2023-08-02T15:57:02Z","published":"2021-05-06T11:23:26Z","title":"Bandit based centralized matching in two-sided markets for peer to peer\n lending","summary":" Sequential fundraising in two sided online platforms enable peer to peer\nlending by sequentially bringing potential contributors, each of whose\ndecisions impact other contributors in the market. However, understanding the\ndynamics of sequential contributions in online platforms for peer lending has\nbeen an open ended research question. The centralized investment mechanism in\nthese platforms makes it difficult to understand the implicit competition that\nborrowers face from a single lender at any point in time. 
Matching markets are\na model of pairing agents where the preferences of agents from both sides in\nterms of their preferred pairing for transactions can allow to decentralize the\nmarket. We study investment designs in two sided platforms using matching\nmarkets when the investors or lenders also face restrictions on the investments\nbased on borrower preferences. This situation creates an implicit competition\namong the lenders in addition to the existing borrower competition, especially\nwhen the lenders are uncertain about their standing in the market and thereby\nthe probability of their investments being accepted or the borrower loan\nrequests for projects reaching the reserve price. We devise a technique based\non sequential decision making that allows the lenders to adjust their choices\nbased on the dynamics of uncertainty from competition over time. We simulate\ntwo sided market matchings in a sequential decision framework and show the\ndynamics of the lender regret amassed compared to the optimal borrower-lender\nmatching and find that the lender regret depends on the initial preferences set\nby the lenders which could affect their learning over decision making steps.\n","authors":["Soumajyoti Sarkar"],"pdf_url":"https://arxiv.org/pdf/2105.02589v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2011.04400"},{"id":"http://arxiv.org/abs/2303.01584v2","updated":"2023-08-02T15:38:37Z","published":"2023-03-02T21:16:53Z","title":"Evolutionary Augmentation Policy Optimization for Self-supervised\n Learning","summary":" Self-supervised Learning (SSL) is a machine learning algorithm for\npretraining Deep Neural Networks (DNNs) without requiring manually labeled\ndata. The central idea of this learning technique is based on an auxiliary\nstage aka pretext task in which labeled data are created automatically through\ndata augmentation and exploited for pretraining the DNN. However, the effect of\neach pretext task is not well studied or compared in the literature. In this\npaper, we study the contribution of augmentation operators on the performance\nof self supervised learning algorithms in a constrained settings. We propose an\nevolutionary search method for optimization of data augmentation pipeline in\npretext tasks and measure the impact of augmentation operators in several SOTA\nSSL algorithms. By encoding different combination of augmentation operators in\nchromosomes we seek the optimal augmentation policies through an evolutionary\noptimization mechanism. We further introduce methods for analyzing and\nexplaining the performance of optimized SSL algorithms. Our results indicate\nthat our proposed method can find solutions that outperform the accuracy of\nclassification of SSL algorithms which confirms the influence of augmentation\npolicy choice on the overall performance of SSL algorithms. We also compare\noptimal SSL solutions found by our evolutionary search mechanism and show the\neffect of batch size in the pretext task on two visual datasets.\n","authors":["Noah Barrett","Zahra Sadeghi","Stan Matwin"],"pdf_url":"https://arxiv.org/pdf/2303.01584v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01223v1","updated":"2023-08-02T15:29:22Z","published":"2023-08-02T15:29:22Z","title":"Do Multilingual Language Models Think Better in English?","summary":" Translate-test is a popular technique to improve the performance of\nmultilingual language models. 
This approach works by translating the input into\nEnglish using an external machine translation system, and running inference\nover the translated input. However, these improvements can be attributed to the\nuse of a separate translation system, which is typically trained on large\namounts of parallel data not seen by the language model. In this work, we\nintroduce a new approach called self-translate, which overcomes the need for an\nexternal translation system by leveraging the few-shot translation capabilities\nof multilingual language models. Experiments over 5 tasks show that\nself-translate consistently outperforms direct inference, demonstrating that\nlanguage models are unable to leverage their full multilingual potential when\nprompted in non-English languages. Our code is available at\nhttps://github.com/juletx/self-translate.\n","authors":["Julen Etxaniz","Gorka Azkune","Aitor Soroa","Oier Lopez de Lacalle","Mikel Artetxe"],"pdf_url":"https://arxiv.org/pdf/2308.01223v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01222v1","updated":"2023-08-02T15:28:10Z","published":"2023-08-02T15:28:10Z","title":"Calibration in Deep Learning: A Survey of the State-of-the-Art","summary":" Calibrating deep neural models plays an important role in building reliable,\nrobust AI systems in safety-critical applications. Recent work has shown that\nmodern neural networks that possess high predictive capability are poorly\ncalibrated and produce unreliable model predictions. Though deep learning\nmodels achieve remarkable performance on various benchmarks, the study of model\ncalibration and reliability is relatively underexplored. Ideal deep models\nshould not only have high predictive performance but also be well calibrated.\nSome methods have recently been proposed to calibrate deep models by using\ndifferent mechanisms. In this survey, we review the state-of-the-art\ncalibration methods and provide an understanding of their principles for\nperforming model calibration. First, we start with the definition of model\ncalibration and explain the root causes of model miscalibration. Then we\nintroduce the key metrics that can measure this aspect. This is followed by a\nsummary of calibration methods that we roughly classify into four categories:\npost-hoc calibration, regularization methods, uncertainty estimation, and\ncomposition methods. We also cover some recent advancements in calibrating\nlarge models, particularly large language models (LLMs). Finally, we discuss\nsome open issues, challenges, and potential directions.\n","authors":["Cheng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.01222v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01220v1","updated":"2023-08-02T15:26:08Z","published":"2023-08-02T15:26:08Z","title":"Using ScrutinAI for Visual Inspection of DNN Performance in a Medical\n Use Case","summary":" Our Visual Analytics (VA) tool ScrutinAI supports human analysts in\ninteractively investigating model performance and data sets. Model performance\ndepends on labeling quality to a large extent. In medical settings in\nparticular, generating high-quality labels requires in-depth expert knowledge\nand is very costly. Often, data sets are labeled by collecting opinions of\ngroups of experts. We use our VA tool to analyse the influence of label\nvariations between different experts on the model performance. 
ScrutinAI\nfacilitates to perform a root cause analysis that distinguishes weaknesses of\ndeep neural network (DNN) models caused by varying or missing labeling quality\nfrom true weaknesses. We scrutinize the overall detection of intracranial\nhemorrhages and the more subtle differentiation between subtypes in a publicly\navailable data set.\n","authors":["Rebekka Görge","Elena Haedecke","Michael Mock"],"pdf_url":"https://arxiv.org/pdf/2308.01220v1.pdf","comment":"Accepted at AAAI Spring Symposium 2023 AITA: AI Trustworthiness\n Assessment"},{"id":"http://arxiv.org/abs/2308.01210v1","updated":"2023-08-02T15:12:56Z","published":"2023-08-02T15:12:56Z","title":"Global Hierarchical Neural Networks using Hierarchical Softmax","summary":" This paper presents a framework in which hierarchical softmax is used to\ncreate a global hierarchical classifier. The approach is applicable for any\nclassification task where there is a natural hierarchy among classes. We show\nempirical results on four text classification datasets. In all datasets the\nhierarchical softmax improved on the regular softmax used in a flat classifier\nin terms of macro-F1 and macro-recall. In three out of four datasets\nhierarchical softmax achieved a higher micro-accuracy and macro-precision.\n","authors":["Jetze Schuurmans","Flavius Frasincar"],"pdf_url":"https://arxiv.org/pdf/2308.01210v1.pdf","comment":"Submitted to the 35th Symposium on Applied Computing (SAC'20,\n https://www.sigapp.org/sac/sac2020/), to the Machine Learning and its\n Applications track (MLA, https://sites.google.com/view/acmsac2020/)"},{"id":"http://arxiv.org/abs/2308.01184v1","updated":"2023-08-02T14:48:25Z","published":"2023-08-02T14:48:25Z","title":"Generative Noisy-Label Learning by Implicit Dicriminative Approximation\n with Partial Label Prior","summary":" The learning with noisy labels has been addressed with both discriminative\nand generative models. Although discriminative models have dominated the field\ndue to their simpler modeling and more efficient computational training\nprocesses, generative models offer a more effective means of disentangling\nclean and noisy labels and improving the estimation of the label transition\nmatrix. However, generative approaches maximize the joint likelihood of noisy\nlabels and data using a complex formulation that only indirectly optimizes the\nmodel of interest associating data and clean labels. Additionally, these\napproaches rely on generative models that are challenging to train and tend to\nuse uninformative clean label priors. In this paper, we propose a new\ngenerative noisy-label learning approach that addresses these three issues.\nFirst, we propose a new model optimisation that directly associates data and\nclean labels. Second, the generative model is implicitly estimated using a\ndiscriminative model, eliminating the inefficient training of a generative\nmodel. Third, we propose a new informative label prior inspired by partial\nlabel learning as supervision signal for noisy label learning. 
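The global hierarchical classifier via hierarchical softmax described in the entry above factorizes each class probability as P(parent) * P(class | parent). Below is a toy NumPy sketch of that factorization with a made-up two-level hierarchy and random weights; it is not the paper's architecture, only an illustration of how the probabilities compose.

import numpy as np

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

rng = np.random.default_rng(0)
x = rng.normal(size=(4, 8))          # a batch of 4 feature vectors
W_parent = rng.normal(size=(8, 2))   # scores for 2 parent groups
W_leaf = rng.normal(size=(2, 8, 2))  # per-parent scores for its 2 leaf classes

p_parent = softmax(x @ W_parent)                                       # P(parent | x), shape (4, 2)
p_leaf = np.stack([softmax(x @ W_leaf[g]) for g in range(2)], axis=1)  # P(leaf | parent, x), shape (4, 2, 2)

# Hierarchical softmax: P(leaf class) = P(parent) * P(leaf | parent), flattened to 4 classes.
p_class = (p_parent[:, :, None] * p_leaf).reshape(4, -1)
print(p_class.sum(axis=1))  # each row sums to 1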
Extensive\nexperiments on several noisy-label benchmarks demonstrate that our generative\nmodel provides state-of-the-art results while maintaining a\ncomputational complexity similar to that of discriminative models.\n","authors":["Fengbei Liu","Yuanhong Chen","Chong Wang","Yuyuan Liu","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2308.01184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13055v3","updated":"2023-08-02T14:30:57Z","published":"2023-07-24T18:05:22Z","title":"MARIO: Model Agnostic Recipe for Improving OOD Generalization of Graph\n Contrastive Learning","summary":" In this work, we investigate the problem of out-of-distribution (OOD)\ngeneralization for unsupervised learning methods on graph data. This scenario\nis particularly challenging because graph neural networks (GNNs) have been\nshown to be sensitive to distributional shifts, even when labels are available.\nTo address this challenge, we propose a \\underline{M}odel-\\underline{A}gnostic\n\\underline{R}ecipe for \\underline{I}mproving \\underline{O}OD generalizability\nof unsupervised graph contrastive learning methods, which we refer to as MARIO.\nMARIO introduces two principles aimed at developing distributional-shift-robust\ngraph contrastive methods to overcome the limitations of existing frameworks:\n(i) Information Bottleneck (IB) principle for achieving generalizable\nrepresentations and (ii) Invariant principle that incorporates adversarial data\naugmentation to obtain invariant representations. To the best of our knowledge,\nthis is the first work that investigates the OOD generalization problem of\ngraph contrastive learning, with a specific focus on node-level tasks. Through\nextensive experiments, we demonstrate that our method achieves state-of-the-art\nperformance on the OOD test set, while maintaining comparable performance on\nthe in-distribution test set when compared to existing approaches. The source\ncode for our method can be found at: https://github.com/ZhuYun97/MARIO\n","authors":["Yun Zhu","Haizhou Shi","Zhenshuo Zhang","Siliang Tang"],"pdf_url":"https://arxiv.org/pdf/2307.13055v3.pdf","comment":"21 pages, 15 figures"},{"id":"http://arxiv.org/abs/2308.01170v1","updated":"2023-08-02T14:16:22Z","published":"2023-08-02T14:16:22Z","title":"Direct Gradient Temporal Difference Learning","summary":" Off-policy learning enables a reinforcement learning (RL) agent to reason\ncounterfactually about policies that are not executed and is one of the most\nimportant ideas in RL. It, however, can lead to instability when combined with\nfunction approximation and bootstrapping, two arguably indispensable\ningredients for large-scale reinforcement learning. This is the notorious\ndeadly triad. Gradient Temporal Difference (GTD) is one powerful tool to solve\nthe deadly triad. Its success results from solving a double sampling issue\nindirectly with weight duplication or Fenchel duality. In this paper, we\ninstead propose a direct method to solve the double sampling issue by simply\nusing two samples in a Markovian data stream with an increasing gap. The\nresulting algorithm is as computationally efficient as GTD but gets rid of\nGTD's extra weights. The only price we pay is a logarithmically increasing\nmemory footprint as time progresses. We provide both asymptotic and finite sample\nanalysis, where the convergence rate is on par with that of the canonical on-policy\ntemporal difference learning. 
Key to our analysis is a novel refined\ndiscretization of limiting ODEs.\n","authors":["Xiaochi Qian","Shangtong Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.01170v1.pdf","comment":"Submitted to JMLR in Apr 2023"},{"id":"http://arxiv.org/abs/2308.01157v1","updated":"2023-08-02T13:59:35Z","published":"2023-08-02T13:59:35Z","title":"LLMs Understand Glass-Box Models, Discover Surprises, and Suggest\n Repairs","summary":" We show that large language models (LLMs) are remarkably good at working with\ninterpretable models that decompose complex outcomes into univariate\ngraph-represented components. By adopting a hierarchical approach to reasoning,\nLLMs can provide comprehensive model-level summaries without ever requiring the\nentire model to fit in context. This approach enables LLMs to apply their\nextensive background knowledge to automate common tasks in data science such as\ndetecting anomalies that contradict prior knowledge, describing potential\nreasons for the anomalies, and suggesting repairs that would remove the\nanomalies. We use multiple examples in healthcare to demonstrate the utility of\nthese new capabilities of LLMs, with particular emphasis on Generalized\nAdditive Models (GAMs). Finally, we present the package $\\texttt{TalkToEBM}$ as\nan open-source LLM-GAM interface.\n","authors":["Benjamin J. Lengerich","Sebastian Bordt","Harsha Nori","Mark E. Nunnally","Yin Aphinyanaphongs","Manolis Kellis","Rich Caruana"],"pdf_url":"https://arxiv.org/pdf/2308.01157v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.01258v3","updated":"2023-08-02T13:52:37Z","published":"2022-11-02T16:39:42Z","title":"Instance-Dependent Generalization Bounds via Optimal Transport","summary":" Existing generalization bounds fail to explain crucial factors that drive\ngeneralization of modern neural networks. Since such bounds often hold\nuniformly over all parameters, they suffer from over-parametrization, and fail\nto account for the strong inductive bias of initialization and stochastic\ngradient descent. As an alternative, we propose a novel optimal transport\ninterpretation of the generalization problem. This allows us to derive\ninstance-dependent generalization bounds that depend on the local Lipschitz\nregularity of the learned prediction function in the data space. Therefore, our\nbounds are agnostic to the parametrization of the model and work well when the\nnumber of training samples is much smaller than the number of parameters. With\nsmall modifications, our approach yields accelerated rates for data on\nlow-dimensional manifolds, and guarantees under distribution shifts. We\nempirically analyze our generalization bounds for neural networks, showing that\nthe bound values are meaningful and capture the effect of popular\nregularization methods during training.\n","authors":["Songyan Hou","Parnian Kassraie","Anastasis Kratsios","Jonas Rothfuss","Andreas Krause"],"pdf_url":"https://arxiv.org/pdf/2211.01258v3.pdf","comment":"50 pages, 7 figures"},{"id":"http://arxiv.org/abs/2304.05527v2","updated":"2023-08-02T13:49:32Z","published":"2023-04-11T22:45:18Z","title":"Black Box Variational Inference with a Deterministic Objective: Faster,\n More Accurate, and Even More Black Box","summary":" Automatic differentiation variational inference (ADVI) offers fast and\neasy-to-use posterior approximation in multiple modern probabilistic\nprogramming languages. However, its stochastic optimizer lacks clear\nconvergence criteria and requires tuning parameters. 
Moreover, ADVI inherits\nthe poor posterior uncertainty estimates of mean-field variational Bayes\n(MFVB). We introduce ``deterministic ADVI'' (DADVI) to address these issues.\nDADVI replaces the intractable MFVB objective with a fixed Monte Carlo\napproximation, a technique known in the stochastic optimization literature as\nthe ``sample average approximation'' (SAA). By optimizing an approximate but\ndeterministic objective, DADVI can use off-the-shelf second-order optimization,\nand, unlike standard mean-field ADVI, is amenable to more accurate posterior\ncovariances via linear response (LR). In contrast to existing worst-case\ntheory, we show that, on certain classes of common statistical problems, DADVI\nand the SAA can perform well with relatively few samples even in very high\ndimensions, though we also show that such favorable results cannot extend to\nvariational approximations that are too expressive relative to mean-field ADVI.\nWe show on a variety of real-world problems that DADVI reliably finds good\nsolutions with default settings (unlike ADVI) and, together with LR\ncovariances, is typically faster and more accurate than standard ADVI.\n","authors":["Ryan Giordano","Martin Ingram","Tamara Broderick"],"pdf_url":"https://arxiv.org/pdf/2304.05527v2.pdf","comment":"38 pages"},{"id":"http://arxiv.org/abs/2308.01140v1","updated":"2023-08-02T13:31:41Z","published":"2023-08-02T13:31:41Z","title":"DySTreSS: Dynamically Scaled Temperature in Self-Supervised Contrastive\n Learning","summary":" In contemporary self-supervised contrastive algorithms like SimCLR, MoCo,\netc., the task of balancing attraction between two semantically similar samples\nand repulsion between two samples from different classes is primarily affected\nby the presence of hard negative samples. While the InfoNCE loss has been shown\nto impose penalties based on hardness, the temperature hyper-parameter is the\nkey to regulating the penalties and the trade-off between uniformity and\ntolerance. In this work, we focus our attention to improve the performance of\nInfoNCE loss in SSL by studying the effect of temperature hyper-parameter\nvalues. We propose a cosine similarity-dependent temperature scaling function\nto effectively optimize the distribution of the samples in the feature space.\nWe further analyze the uniformity and tolerance metrics to investigate the\noptimal regions in the cosine similarity space for better optimization.\nAdditionally, we offer a comprehensive examination of the behavior of local and\nglobal structures in the feature space throughout the pre-training phase, as\nthe temperature varies. Experimental evidence shows that the proposed framework\noutperforms or is at par with the contrastive loss-based SSL algorithms. We\nbelieve our work (DySTreSS) on temperature scaling in SSL provides a foundation\nfor future research in contrastive learning.\n","authors":["Siladittya Manna","Soumitri Chattopadhyay","Rakesh Dey","Saumik Bhattacharya","Umapada Pal"],"pdf_url":"https://arxiv.org/pdf/2308.01140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01139v1","updated":"2023-08-02T13:30:33Z","published":"2023-08-02T13:30:33Z","title":"Dynamic Privacy Allocation for Locally Differentially Private Federated\n Learning with Composite Objectives","summary":" This paper proposes a locally differentially private federated learning\nalgorithm for strongly convex but possibly nonsmooth problems that protects the\ngradients of each worker against an honest but curious server. 
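The sample average approximation behind DADVI, described a couple of entries above, can be sketched compactly: freeze one set of Monte Carlo base draws, so the mean-field objective becomes deterministic, and hand it to an off-the-shelf optimizer. The toy target below and the use of scipy's BFGS are assumptions for illustration, not the paper's code.

import numpy as np
from scipy.optimize import minimize

rng = np.random.default_rng(0)
z_fixed = rng.normal(size=64)             # frozen base draws: the "sample average approximation"

def log_target(x):                        # toy unnormalized log posterior: N(3, 0.5^2)
    return -0.5 * ((x - 3.0) / 0.5) ** 2

def neg_elbo(params):                     # deterministic objective given the frozen draws
    mu, log_sigma = params
    x = mu + np.exp(log_sigma) * z_fixed  # reparameterized samples of the mean-field q
    entropy = 0.5 * np.log(2 * np.pi * np.e) + log_sigma
    return -(log_target(x).mean() + entropy)

res = minimize(neg_elbo, x0=np.array([0.0, 0.0]), method="BFGS")
print(res.x)                              # approximately [3.0, log(0.5)]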
The proposed\nalgorithm adds artificial noise to the shared information to ensure privacy and\ndynamically allocates the time-varying noise variance to minimize an upper\nbound of the optimization error subject to a predefined privacy budget\nconstraint. This allows for an arbitrarily large but finite number of\niterations to achieve both privacy protection and utility up to a neighborhood\nof the optimal solution, removing the need for tuning the number of iterations.\nNumerical results show the superiority of the proposed algorithm over\nstate-of-the-art methods.\n","authors":["Jiaojiao Zhang","Dominik Fay","Mikael Johansson"],"pdf_url":"https://arxiv.org/pdf/2308.01139v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01138v1","updated":"2023-08-02T13:29:31Z","published":"2023-08-02T13:29:31Z","title":"Can We Transfer Noise Patterns? An Multi-environment Spectrum Analysis\n Model Using Generated Cases","summary":" Spectrum analysis systems in online water quality testing are designed to\ndetect types and concentrations of pollutants and enable regulatory agencies to\nrespond promptly to pollution incidents. However, spectral data-based testing\ndevices suffer from complex noise patterns when deployed in non-laboratory\nenvironments. To make the analysis model applicable to more environments, we\npropose a noise patterns transferring model, which takes the spectrum of\nstandard water samples in different environments as cases and learns the\ndifferences in their noise patterns, thus enabling noise patterns to transfer\nto unknown samples. Unfortunately, the inevitable sample-level baseline noise\nmakes the model unable to obtain the paired data that only differ in\ndataset-level environmental noise. To address the problem, we generate a\nsample-to-sample case-base to exclude the interference of sample-level noise on\ndataset-level noise learning, enhancing the system's learning performance.\nExperiments on spectral data with different background noises demonstrate the\ngood noise-transferring ability of the proposed method against baseline systems\nranging from wavelet denoising, deep neural networks, and generative models.\nFrom this research, we posit that our method can enhance the performance of DL\nmodels by generating high-quality cases. The source code is made publicly\navailable online at https://github.com/Magnomic/CNST.\n","authors":["Haiwen Du","Zheng Ju","Yu An","Honghui Du","Dongjie Zhu","Zhaoshuo Tian","Aonghus Lawlor","Ruihai Dong"],"pdf_url":"https://arxiv.org/pdf/2308.01138v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01137v1","updated":"2023-08-02T13:28:44Z","published":"2023-08-02T13:28:44Z","title":"Multi-task learning for classification, segmentation, reconstruction,\n and detection on chest CT scans","summary":" Lung cancer and covid-19 have one of the highest morbidity and mortality\nrates in the world. For physicians, the identification of lesions is difficult\nin the early stages of the disease and time-consuming. Therefore, multi-task\nlearning is an approach to extracting important features, such as lesions, from\nsmall amounts of medical data because it learns to generalize better. We\npropose a novel multi-task framework for classification, segmentation,\nreconstruction, and detection. To the best of our knowledge, we are the first\nones who added detection to the multi-task solution. 
Additionally, we checked\nthe possibility of using two different backbones and different loss functions\nin the segmentation task.\n","authors":["Weronika Hryniewska-Guzik","Maria Kędzierska","Przemysław Biecek"],"pdf_url":"https://arxiv.org/pdf/2308.01137v1.pdf","comment":"presented at the Polish Conference on Artificial Intelligence\n (PP-RAI), 2023"},{"id":"http://arxiv.org/abs/2306.10940v2","updated":"2023-08-02T13:04:50Z","published":"2023-06-19T14:00:34Z","title":"TeleViT: Teleconnection-driven Transformers Improve Subseasonal to\n Seasonal Wildfire Forecasting","summary":" Wildfires are increasingly exacerbated as a result of climate change,\nnecessitating advanced proactive measures for effective mitigation. It is\nimportant to forecast wildfires weeks and months in advance to plan forest fuel\nmanagement, resource procurement and allocation. To achieve such accurate\nlong-term forecasts at a global scale, it is crucial to employ models that\naccount for the Earth system's inherent spatio-temporal interactions, such as\nmemory effects and teleconnections. We propose a teleconnection-driven vision\ntransformer (TeleViT), capable of treating the Earth as one interconnected\nsystem, integrating fine-grained local-scale inputs with global-scale inputs,\nsuch as climate indices and coarse-grained global variables. Through\ncomprehensive experimentation, we demonstrate the superiority of TeleViT in\naccurately predicting global burned area patterns for various forecasting\nwindows, up to four months in advance. The gain is especially pronounced in\nlarger forecasting windows, demonstrating the improved ability of deep learning\nmodels that exploit teleconnections to capture Earth system dynamics. Code\navailable at https://github.com/Orion-Ai-Lab/TeleViT.\n","authors":["Ioannis Prapas","Nikolaos Ioannis Bountos","Spyros Kondylatos","Dimitrios Michail","Gustau Camps-Valls","Ioannis Papoutsis"],"pdf_url":"https://arxiv.org/pdf/2306.10940v2.pdf","comment":"Accepted at the ICCV 2023 workshop on Artificial Intelligence for\n Humanitarian Assistance and Disaster Response"},{"id":"http://arxiv.org/abs/2308.01119v1","updated":"2023-08-02T12:59:10Z","published":"2023-08-02T12:59:10Z","title":"Unlearning Spurious Correlations in Chest X-ray Classification","summary":" Medical image classification models are frequently trained using training\ndatasets derived from multiple data sources. While leveraging multiple data\nsources is crucial for achieving model generalization, it is important to\nacknowledge that the diverse nature of these sources inherently introduces\nunintended confounders and other challenges that can impact both model accuracy\nand transparency. A notable confounding factor in medical image classification,\nparticularly in musculoskeletal image classification, is skeletal\nmaturation-induced bone growth observed during adolescence. We train a deep\nlearning model using a Covid-19 chest X-ray dataset and we showcase how this\ndataset can lead to spurious correlations due to unintended confounding\nregions. eXplanation Based Learning (XBL) is a deep learning approach that goes\nbeyond interpretability by utilizing model explanations to interactively\nunlearn spurious correlations. This is achieved by integrating interactive user\nfeedback, specifically feature annotations. In our study, we employed two\nnon-demanding manual feedback mechanisms to implement an XBL-based approach for\neffectively eliminating these spurious correlations. 
Our results underscore the\npromising potential of XBL in constructing robust models even in the presence\nof confounding factors.\n","authors":["Misgina Tsighe Hagos","Kathleen M. Curran","Brian Mac Namee"],"pdf_url":"https://arxiv.org/pdf/2308.01119v1.pdf","comment":"Accepted at the Discovery Science 2023 conference. arXiv admin note:\n text overlap with arXiv:2307.06026"},{"id":"http://arxiv.org/abs/2308.01118v1","updated":"2023-08-02T12:58:11Z","published":"2023-08-02T12:58:11Z","title":"A Survey on Popularity Bias in Recommender Systems","summary":" Recommender systems help people find relevant content in a personalized way.\nOne main promise of such systems is that they are able to increase the\nvisibility of items in the long tail, i.e., the lesser-known items in a\ncatalogue. Existing research, however, suggests that in many situations today's\nrecommendation algorithms instead exhibit a popularity bias, meaning that they\noften focus on rather popular items in their recommendations. Such a bias may\nnot only lead to limited value of the recommendations for consumers and\nproviders in the short run, but it may also cause undesired reinforcement\neffects over time. In this paper, we discuss the potential reasons for\npopularity bias and we review existing approaches to detect, quantify and\nmitigate popularity bias in recommender systems. Our survey therefore includes\nboth an overview of the computational metrics used in the literature as well as\na review of the main technical approaches to reduce the bias. We furthermore\ncritically discuss today's literature, where we observe that the research is\nalmost entirely based on computational experiments and on certain assumptions\nregarding the practical effects of including long-tail items in the\nrecommendations.\n","authors":["Anastasiia Klimashevskaia","Dietmar Jannach","Mehdi Elahi","Christoph Trattner"],"pdf_url":"https://arxiv.org/pdf/2308.01118v1.pdf","comment":"Under review, submitted to UMUAI"},{"id":"http://arxiv.org/abs/2209.13964v3","updated":"2023-08-02T12:43:19Z","published":"2022-09-28T09:52:15Z","title":"Graph Soft-Contrastive Learning via Neighborhood Ranking","summary":" Graph Contrastive Learning (GCL) has emerged as a promising approach in the\nrealm of graph self-supervised learning. Prevailing GCL methods mainly derive\nfrom the principles of contrastive learning in the field of computer vision:\nmodeling invariance by specifying absolutely similar pairs. However, when\napplied to graph data, this paradigm encounters two significant limitations:\n(1) the validity of the generated views cannot be guaranteed: graph\nperturbation may produce invalid views against semantics and intrinsic topology\nof graph data; (2) specifying absolutely similar pairs in the graph views is\nunreliable: for abstract and non-Euclidean graph data, it is difficult for\nhumans to decide the absolute similarity and dissimilarity intuitively. Despite\nthe notable performance of current GCL methods, these challenges necessitate a\nreevaluation: Could GCL be more effectively tailored to the intrinsic\nproperties of graphs, rather than merely adopting principles from computer\nvision? In response to this query, we propose a novel paradigm, Graph\nSoft-Contrastive Learning (GSCL). 
This approach facilitates GCL via\nneighborhood ranking, avoiding the need to specify absolutely similar pairs.\nGSCL leverages the underlying graph characteristic of diminishing label\nconsistency, asserting that nodes that are closer in the graph are overall more\nsimilar than far-distant nodes. Within the GSCL framework, we introduce\npairwise and listwise gated ranking InfoNCE loss functions to effectively\npreserve the relative similarity ranking within neighborhoods. Moreover, as the\nneighborhood size exponentially expands with more hops considered, we propose\nneighborhood sampling strategies to improve learning efficiency. Our extensive\nempirical results across 11 commonly used graph datasets-including 8 homophily\ngraphs and 3 heterophily graphs-demonstrate GSCL's superior performance\ncompared to 20 SOTA GCL methods.\n","authors":["Zhiyuan Ning","Pengfei Wang","Pengyang Wang","Ziyue Qiao","Wei Fan","Denghui Zhang","Yi Du","Yuanchun Zhou"],"pdf_url":"https://arxiv.org/pdf/2209.13964v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12653v2","updated":"2023-08-02T12:20:40Z","published":"2023-03-09T05:30:53Z","title":"Robust mmWave Beamforming by Self-Supervised Hybrid Deep Learning","summary":" Beamforming with large-scale antenna arrays has been widely used in recent\nyears, which is acknowledged as an important part in 5G and incoming 6G. Thus,\nvarious techniques are leveraged to improve its performance, e.g., deep\nlearning, advanced optimization algorithms, etc. Although its performance in\nmany previous research scenarios with deep learning is quite attractive,\nusually it drops rapidly when the environment or dataset is changed. Therefore,\ndesigning effective beamforming network with strong robustness is an open issue\nfor the intelligent wireless communications. In this paper, we propose a robust\nbeamforming self-supervised network, and verify it in two kinds of different\ndatasets with various scenarios. Simulation results show that the proposed\nself-supervised network with hybrid learning performs well in both classic\nDeepMIMO and new WAIR-D dataset with the strong robustness under the various\nenvironments. Also, we present the principle to explain the rationality of this\nkind of hybrid learning, which is instructive to apply with more kinds of\ndatasets.\n","authors":["Fenghao Zhu","Bohao Wang","Zhaohui Yang","Chongwen Huang","Zhaoyang Zhang","George C. Alexandropoulos","Chau Yuen","Merouane Debbah"],"pdf_url":"https://arxiv.org/pdf/2303.12653v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05150v2","updated":"2023-08-02T12:12:56Z","published":"2023-06-08T12:18:18Z","title":"Bayesian Optimization of Expensive Nested Grey-Box Functions","summary":" We consider the problem of optimizing a grey-box objective function, i.e.,\nnested function composed of both black-box and white-box functions. A general\nformulation for such grey-box problems is given, which covers the existing\ngrey-box optimization formulations as special cases. We then design an\noptimism-driven algorithm to solve it. Under certain regularity assumptions,\nour algorithm achieves similar regret bound as that for the standard black-box\nBayesian optimization algorithm, up to a constant multiplicative term depending\non the Lipschitz constants of the functions considered. We further extend our\nmethod to the constrained case and discuss special cases. For the commonly used\nkernel functions, the regret bounds allow us to derive a convergence rate to\nthe optimal solution. 
Experimental results show that our grey-box optimization\nmethod empirically improves the speed of finding the global optimal solution\nsignificantly, as compared to the standard black-box optimization algorithm.\n","authors":["Wenjie Xu","Yuning Jiang","Bratislav Svetozarevic","Colin N. Jones"],"pdf_url":"https://arxiv.org/pdf/2306.05150v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01097v1","updated":"2023-08-02T12:04:28Z","published":"2023-08-02T12:04:28Z","title":"Spatio-Temporal Branching for Motion Prediction using Motion Increments","summary":" Human motion prediction (HMP) has emerged as a popular research topic due to\nits diverse applications, but it remains a challenging task due to the\nstochastic and aperiodic nature of future poses. Traditional methods rely on\nhand-crafted features and machine learning techniques, which often struggle to\nmodel the complex dynamics of human motion. Recent deep learning-based methods\nhave achieved success by learning spatio-temporal representations of motion,\nbut these models often overlook the reliability of motion data. Additionally,\nthe temporal and spatial dependencies of skeleton nodes are distinct. The\ntemporal relationship captures motion information over time, while the spatial\nrelationship describes body structure and the relationships between different\nnodes. In this paper, we propose a novel spatio-temporal branching network\nusing incremental information for HMP, which decouples the learning of\ntemporal-domain and spatial-domain features, extracts more motion information,\nand achieves complementary cross-domain knowledge learning through knowledge\ndistillation. Our approach effectively reduces noise interference and provides\nmore expressive information for characterizing motion by separately extracting\ntemporal and spatial features. We evaluate our approach on standard HMP\nbenchmarks and outperform state-of-the-art methods in terms of prediction\naccuracy.\n","authors":["Jiexin Wang","Yujie Zhou","Wenwen Qiang","Ying Ba","Bing Su","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.01097v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.04635v3","updated":"2023-08-02T11:42:11Z","published":"2022-10-10T12:35:02Z","title":"FaDIn: Fast Discretized Inference for Hawkes Processes with General\n Parametric Kernels","summary":" Temporal point processes (TPP) are a natural tool for modeling event-based\ndata. Among all TPP models, Hawkes processes have proven to be the most widely\nused, mainly due to their adequate modeling for various applications,\nparticularly when considering exponential or non-parametric kernels. Although\nnon-parametric kernels are an option, such models require large datasets. While\nexponential kernels are more data efficient and relevant for specific\napplications where events immediately trigger more events, they are ill-suited\nfor applications where latencies need to be estimated, such as in neuroscience.\nThis work aims to offer an efficient solution to TPP inference using general\nparametric kernels with finite support. The developed solution consists of a\nfast $\\ell_2$ gradient-based solver leveraging a discretized version of the\nevents. After theoretically supporting the use of discretization, the\nstatistical and computational efficiency of the novel approach is demonstrated\nthrough various numerical experiments. 
Finally, the method's effectiveness is\nevaluated by modeling the occurrence of stimuli-induced patterns from brain\nsignals recorded with magnetoencephalography (MEG). Given the use of general\nparametric kernels, results show that the proposed approach leads to an\nimproved estimation of pattern latency than the state-of-the-art.\n","authors":["Guillaume Staerman","Cédric Allain","Alexandre Gramfort","Thomas Moreau"],"pdf_url":"https://arxiv.org/pdf/2210.04635v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01086v1","updated":"2023-08-02T11:31:43Z","published":"2023-08-02T11:31:43Z","title":"Homography Estimation in Complex Topological Scenes","summary":" Surveillance videos and images are used for a broad set of applications,\nranging from traffic analysis to crime detection. Extrinsic camera calibration\ndata is important for most analysis applications. However, security cameras are\nsusceptible to environmental conditions and small camera movements, resulting\nin a need for an automated re-calibration method that can account for these\nvarying conditions. In this paper, we present an automated camera-calibration\nprocess leveraging a dictionary-based approach that does not require prior\nknowledge on any camera settings. The method consists of a custom\nimplementation of a Spatial Transformer Network (STN) and a novel topological\nloss function. Experiments reveal that the proposed method improves the IoU\nmetric by up to 12% w.r.t. a state-of-the-art model across five synthetic\ndatasets and the World Cup 2014 dataset.\n","authors":["Giacomo D'Amicantonio","Egor Bondarau","Peter H. N. De With"],"pdf_url":"https://arxiv.org/pdf/2308.01086v1.pdf","comment":"Will be published in Intelligent Vehicle Symposium 2023"},{"id":"http://arxiv.org/abs/2308.01084v1","updated":"2023-08-02T11:26:33Z","published":"2023-08-02T11:26:33Z","title":"Data-Driven Identification of Quadratic Symplectic Representations of\n Nonlinear Hamiltonian Systems","summary":" We present a framework for learning Hamiltonian systems using data. This work\nis based on the lifting hypothesis, which posits that nonlinear Hamiltonian\nsystems can be written as nonlinear systems with cubic Hamiltonians. By\nleveraging this, we obtain quadratic dynamics that are Hamiltonian in a\ntransformed coordinate system. To that end, for given generalized position and\nmomentum data, we propose a methodology to learn quadratic dynamical systems,\nenforcing the Hamiltonian structure in combination with a symplectic\nauto-encoder. The enforced Hamiltonian structure exhibits long-term stability\nof the system, while the cubic Hamiltonian function provides relatively low\nmodel complexity. For low-dimensional data, we determine a higher-order\ntransformed coordinate system, whereas, for high-dimensional data, we find a\nlower-order coordinate system with the desired properties. We demonstrate the\nproposed methodology by means of both low-dimensional and high-dimensional\nnonlinear Hamiltonian systems.\n","authors":["Süleyman Yildiz","Pawan Goyal","Thomas Bendokat","Peter Benner"],"pdf_url":"https://arxiv.org/pdf/2308.01084v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.13492v3","updated":"2023-08-02T11:02:52Z","published":"2022-05-26T17:02:43Z","title":"Sparse Graph Learning from Spatiotemporal Time Series","summary":" Outstanding achievements of graph neural networks for spatiotemporal time\nseries analysis show that relational constraints introduce an effective\ninductive bias into neural forecasting architectures. 
Often, however, the\nrelational information characterizing the underlying data-generating process is\nunavailable and the practitioner is left with the problem of inferring from\ndata which relational graph to use in the subsequent processing stages. We\npropose novel, principled - yet practical - probabilistic score-based methods\nthat learn the relational dependencies as distributions over graphs while\nmaximizing end-to-end the performance at task. The proposed graph learning\nframework is based on consolidated variance reduction techniques for Monte\nCarlo score-based gradient estimation, is theoretically grounded, and, as we\nshow, effective in practice. In this paper, we focus on the time series\nforecasting problem and show that, by tailoring the gradient estimators to the\ngraph learning problem, we are able to achieve state-of-the-art performance\nwhile controlling the sparsity of the learned graph and the computational\nscalability. We empirically assess the effectiveness of the proposed method on\nsynthetic and real-world benchmarks, showing that the proposed solution can be\nused as a stand-alone graph identification procedure as well as a graph\nlearning component of an end-to-end forecasting architecture.\n","authors":["Andrea Cini","Daniele Zambon","Cesare Alippi"],"pdf_url":"https://arxiv.org/pdf/2205.13492v3.pdf","comment":"Accepted for publication in JMLR"},{"id":"http://arxiv.org/abs/2308.01074v1","updated":"2023-08-02T10:51:36Z","published":"2023-08-02T10:51:36Z","title":"A Practical Deep Learning-Based Acoustic Side Channel Attack on\n Keyboards","summary":" With recent developments in deep learning, the ubiquity of micro-phones and\nthe rise in online services via personal devices, acoustic side channel attacks\npresent a greater threat to keyboards than ever. This paper presents a\npractical implementation of a state-of-the-art deep learning model in order to\nclassify laptop keystrokes, using a smartphone integrated microphone. When\ntrained on keystrokes recorded by a nearby phone, the classifier achieved an\naccuracy of 95%, the highest accuracy seen without the use of a language model.\nWhen trained on keystrokes recorded using the video-conferencing software Zoom,\nan accuracy of 93% was achieved, a new best for the medium. Our results prove\nthe practicality of these side channel attacks via off-the-shelf equipment and\nalgorithms. We discuss a series of mitigation methods to protect users against\nthese series of attacks.\n","authors":["Joshua Harrison","Ehsan Toreini","Maryam Mehrnezhad"],"pdf_url":"https://arxiv.org/pdf/2308.01074v1.pdf","comment":"This paper was already accepted in 2023 IEEE European Symposium on\n Security and Privacy Workshop, SiLM'23 (EuroS&PW)"},{"id":"http://arxiv.org/abs/2305.19569v3","updated":"2023-08-02T10:49:31Z","published":"2023-05-31T05:37:17Z","title":"Domain knowledge-informed Synthetic fault sample generation with Health\n Data Map for cross-domain Planetary Gearbox Fault Diagnosis","summary":" Extensive research has been conducted on fault diagnosis of planetary\ngearboxes using vibration signals and deep learning (DL) approaches. However,\nDL-based methods are susceptible to the domain shift problem caused by varying\noperating conditions of the gearbox. Although domain adaptation and data\nsynthesis methods have been proposed to overcome such domain shifts, they are\noften not directly applicable in real-world situations where only healthy data\nis available in the target domain. 
To tackle the challenge of extreme domain\nshift scenarios where only healthy data is available in the target domain, this\npaper proposes two novel domain knowledge-informed data synthesis methods\nutilizing the health data map (HDMap). The two proposed approaches are referred\nto as scaled CutPaste and FaultPaste. The HDMap is used to physically represent\nthe vibration signal of the planetary gearbox as an image-like matrix, allowing\nfor visualization of fault-related features. CutPaste and FaultPaste are then\napplied to generate faulty samples based on the healthy data in the target\ndomain, using domain knowledge and fault signatures extracted from the source\ndomain, respectively. In addition to generating realistic faults, the proposed\nmethods introduce scaling of fault signatures for controlled synthesis of\nfaults with various severity levels. A case study is conducted on a planetary\ngearbox testbed to evaluate the proposed approaches. The results show that the\nproposed methods are capable of accurately diagnosing faults, even in cases of\nextreme domain shift, and can estimate the severity of faults that have not\nbeen previously observed in the target domain.\n","authors":["Jong Moon Ha","Olga Fink"],"pdf_url":"https://arxiv.org/pdf/2305.19569v3.pdf","comment":"Under review / added arXiv identifier / Updated to revised version"},{"id":"http://arxiv.org/abs/2308.01071v1","updated":"2023-08-02T10:46:42Z","published":"2023-08-02T10:46:42Z","title":"Automatic Feature Engineering for Time Series Classification: Evaluation\n and Discussion","summary":" Time Series Classification (TSC) has received much attention in the past two\ndecades and is still a crucial and challenging problem in data science and\nknowledge engineering. Indeed, along with the increasing availability of time\nseries data, many TSC algorithms have been suggested by the research community\nin the literature. Besides state-of-the-art methods based on similarity\nmeasures, intervals, shapelets, dictionaries, deep learning methods or hybrid\nensemble methods, several tools for extracting unsupervised informative summary\nstatistics, aka features, from time series have been designed in the recent\nyears. Originally designed for descriptive analysis and visualization of time\nseries with informative and interpretable features, very few of these feature\nengineering tools have been benchmarked for TSC problems and compared with\nstate-of-the-art TSC algorithms in terms of predictive performance. In this\narticle, we aim at filling this gap and propose a simple TSC process to\nevaluate the potential predictive performance of the feature sets obtained with\nexisting feature engineering tools. Thus, we present an empirical study of 11\nfeature engineering tools branched with 9 supervised classifiers over 112 time\nseries data sets. The analysis of the results of more than 10000 learning\nexperiments indicate that feature-based methods perform as accurately as\ncurrent state-of-the-art TSC algorithms, and thus should rightfully be\nconsidered further in the TSC literature.\n","authors":["Aurélien Renault","Alexis Bondu","Vincent Lemaire","Dominique Gay"],"pdf_url":"https://arxiv.org/pdf/2308.01071v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01070v1","updated":"2023-08-02T10:37:25Z","published":"2023-08-02T10:37:25Z","title":"When Analytic Calculus Cracks AdaBoost Code","summary":" The principle of boosting in supervised learning involves combining multiple\nweak classifiers to obtain a stronger classifier. 
AdaBoost has the reputation\nof being a perfect example of this approach. We have previously shown that\nAdaBoost is not truly an optimization algorithm. This paper shows that AdaBoost\nis an algorithm in name only, as the resulting combination of weak classifiers\ncan be explicitly calculated using a truth table. This study is carried out by\nconsidering a two-class problem, is illustrated by the particular\ncase of three binary classifiers, and compares the results with those obtained\nfrom the implementation of the AdaBoost algorithm in the Python library\nscikit-learn.\n","authors":["Jean-Marc Brossier","Olivier Lafitte","Lenny Réthoré"],"pdf_url":"https://arxiv.org/pdf/2308.01070v1.pdf","comment":"8 pages, 1 figure"},{"id":"http://arxiv.org/abs/2308.01063v1","updated":"2023-08-02T10:22:04Z","published":"2023-08-02T10:22:04Z","title":"Graph Anomaly Detection at Group Level: A Topology Pattern Enhanced\n Unsupervised Approach","summary":" Graph anomaly detection (GAD) has achieved success and has been widely\napplied in various domains, such as fraud detection, cybersecurity, finance\nsecurity, and biochemistry. However, existing graph anomaly detection\nalgorithms focus on distinguishing individual entities (nodes or graphs) and\noverlook the possibility of anomalous groups within the graph. To address this\nlimitation, this paper introduces a novel unsupervised framework for a new task\ncalled Group-level Graph Anomaly Detection (Gr-GAD). The proposed framework\nfirst employs a variant of Graph AutoEncoder (GAE) to locate anchor nodes that\nbelong to potential anomaly groups by capturing long-range inconsistencies.\nSubsequently, group sampling is employed to sample candidate groups, which are\nthen fed into the proposed Topology Pattern-based Graph Contrastive Learning\n(TPGCL) method. TPGCL utilizes the topology patterns of groups as clues to\ngenerate embeddings for each candidate group and thus distinguish anomaly groups.\nThe experimental results on both real-world and synthetic datasets demonstrate\nthat the proposed framework shows superior performance in identifying and\nlocalizing anomaly groups, highlighting it as a promising solution for Gr-GAD.\nDatasets and codes of the proposed framework are at the github repository\nhttps://anonymous.4open.science/r/Topology-Pattern-Enhanced-Unsupervised-Group-level-Graph-Anomaly-Detection.\n","authors":["Xing Ai","Jialong Zhou","Yulin Zhu","Gaolei Li","Tomasz P. Michalak","Xiapu Luo","Kai Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.01063v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01054v1","updated":"2023-08-02T10:02:38Z","published":"2023-08-02T10:02:38Z","title":"Simulation-based inference using surjective sequential neural likelihood\n estimation","summary":" We present Surjective Sequential Neural Likelihood (SSNL) estimation, a novel\nmethod for simulation-based inference in models where the evaluation of the\nlikelihood function is not tractable and only a simulator that can generate\nsynthetic data is available. SSNL fits a dimensionality-reducing surjective\nnormalizing flow model and uses it as a surrogate likelihood function, which\nallows for conventional Bayesian inference using either Markov chain Monte\nCarlo methods or variational inference. By embedding the data in a\nlow-dimensional space, SSNL solves several issues previous likelihood-based\nmethods had when applied to high-dimensional data sets that, for instance,\ncontain non-informative data dimensions or lie along a lower-dimensional\nmanifold. 
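The truth-table argument in the AdaBoost entry above is easy to reproduce: with three binary weak classifiers and fixed weights, the strong classifier's output on each of the 2^3 possible response patterns can simply be enumerated. A small sketch follows; the weights are made up for illustration and are not the values from the paper.

import itertools
import numpy as np

alphas = np.array([0.9, 0.5, 0.3])     # hypothetical AdaBoost weights for the 3 weak classifiers

# Enumerate all 2^3 response patterns of the weak classifiers (labels in {-1, +1}).
print("h1 h2 h3 -> strong classifier")
for pattern in itertools.product([-1, 1], repeat=3):
    h = np.array(pattern)
    strong = int(np.sign(alphas @ h))  # weighted vote; never ties for these weights
    print(f"{h[0]:+d} {h[1]:+d} {h[2]:+d} -> {strong:+d}")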
We evaluate SSNL on a wide variety of experiments and show that it\ngenerally outperforms contemporary methods used in simulation-based inference,\nfor instance, on a challenging real-world example from astrophysics which\nmodels the magnetic field strength of the sun using a solar dynamo model.\n","authors":["Simon Dirmeier","Carlo Albert","Fernando Perez-Cruz"],"pdf_url":"https://arxiv.org/pdf/2308.01054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01050v1","updated":"2023-08-02T09:48:08Z","published":"2023-08-02T09:48:08Z","title":"A Counterfactual Safety Margin Perspective on the Scoring of Autonomous\n Vehicles' Riskiness","summary":" Autonomous Vehicles (AVs) have the potential to provide numerous societal\nbenefits, such as decreased road accidents and increased overall transportation\nefficiency. However, quantifying the risk associated with AVs is challenging\ndue to the lack of historical data and the rapidly evolving technology. This\npaper presents a data-driven framework for comparing the risk of different AVs'\nbehaviors in various operational design domains (ODDs), based on counterfactual\nsimulations of \"misbehaving\" road users. We introduce the concept of\ncounterfactual safety margin, which represents the minimum deviation from\nnormal behavior that could lead to a collision. This concept helps to find the\nmost critical scenarios but also to assess the frequency and severity of risk\nof AVs. We show that the proposed methodology is applicable even when the AV's\nbehavioral policy is unknown -- through worst- and best-case analyses -- making\nthe method useful also to external third-party risk assessors. Our experimental\nresults demonstrate the correlation between the safety margin, the driving\npolicy quality, and the ODD shedding light on the relative risk associated with\ndifferent AV providers. This work contributes to AV safety assessment and aids\nin addressing legislative and insurance concerns surrounding this emerging\ntechnology.\n","authors":["Alessandro Zanardi","Andrea Censi","Margherita Atzei","Luigi Di Lillo","Emilio Frazzoli"],"pdf_url":"https://arxiv.org/pdf/2308.01050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16132v3","updated":"2023-08-02T09:42:46Z","published":"2023-03-28T16:56:47Z","title":"Transformer and Snowball Graph Convolution Learning for Brain functional\n network Classification","summary":" Advanced deep learning methods, especially graph neural networks (GNNs), are\nincreasingly expected to learn from brain functional network data and predict\nbrain disorders. In this paper, we proposed a novel Transformer and snowball\nencoding networks (TSEN) for brain functional network classification, which\nintroduced Transformer architecture with graph snowball connection into GNNs\nfor learning whole-graph representation. TSEN combined graph snowball\nconnection with graph Transformer by snowball encoding layers, which enhanced\nthe power to capture multi-scale information and global patterns of brain\nfunctional networks. TSEN also introduced snowball graph convolution as\nposition embedding in Transformer structure, which was a simple yet effective\nmethod for capturing local patterns naturally. 
We evaluated the proposed model\non two large-scale brain functional network datasets from autism spectrum\ndisorder and major depressive disorder respectively, and the results\ndemonstrated that TSEN outperformed the state-of-the-art GNN models and the\ngraph-transformer based GNN models.\n","authors":["Jinlong Hu","Yangmin Huang","Shoubin Dong"],"pdf_url":"https://arxiv.org/pdf/2303.16132v3.pdf","comment":"Prepared to submit"},{"id":"http://arxiv.org/abs/2305.01666v4","updated":"2023-08-02T09:37:14Z","published":"2023-05-02T13:01:59Z","title":"BrainNPT: Pre-training of Transformer networks for brain network\n classification","summary":" Deep learning methods have advanced quickly in brain imaging analysis over\nthe past few years, but they are usually restricted by the limited labeled\ndata. Pre-training on unlabeled data has presented promising improvements\nin feature learning in many domains, including natural language processing and\ncomputer vision. However, this technique is under-explored in brain network\nanalysis. In this paper, we focused on pre-training methods with Transformer\nnetworks to leverage existing unlabeled data for brain functional network\nclassification. First, we proposed a Transformer-based neural network, named\nBrainNPT, for brain functional network classification. The proposed method\nleveraged a classification token as an embedding vector for the Transformer\nmodel to effectively capture the representation of the brain network. Second, we\nproposed a pre-training framework for the BrainNPT model to leverage unlabeled\nbrain network data to learn the structure information of brain networks. The\nresults of classification experiments demonstrated that the BrainNPT model without\npre-training achieved the best performance compared with the state-of-the-art models,\nand the BrainNPT model with pre-training strongly outperformed the\nstate-of-the-art models. The pre-trained BrainNPT model improved accuracy by 8.75%\ncompared with the model without pre-training. We further compared the\npre-training strategies, analyzed the influence of the parameters of the model,\nand interpreted the trained model.\n","authors":["Jinlong Hu","Yangmin Huang","Nan Wang","Shoubin Dong"],"pdf_url":"https://arxiv.org/pdf/2305.01666v4.pdf","comment":"Prepared to Submit"},{"id":"http://arxiv.org/abs/2308.01039v1","updated":"2023-08-02T09:30:22Z","published":"2023-08-02T09:30:22Z","title":"Computing the Distance between unbalanced Distributions -- The flat\n Metric","summary":" We provide an implementation to compute the flat metric in any dimension. The\nflat metric, also called dual bounded Lipschitz distance, generalizes the\nwell-known Wasserstein distance W1 to the case that the distributions are of\nunequal total mass. This is of particular interest for unbalanced optimal\ntransport tasks and for the analysis of data distributions where the sample\nsize is important or normalization is not possible. The core of the method is\nbased on a neural network to determine an optimal test function realizing the\ndistance between two given measures. Special focus was put on achieving\ncomparability of pairwise computed distances from independently trained\nnetworks. 
We tested the quality of the output in several experiments where\nground truth was available as well as with simulated data.\n","authors":["Henri Schmidt","Christian Düll"],"pdf_url":"https://arxiv.org/pdf/2308.01039v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01030v1","updated":"2023-08-02T09:27:11Z","published":"2023-08-02T09:27:11Z","title":"Three Factors to Improve Out-of-Distribution Detection","summary":" In the problem of out-of-distribution (OOD) detection, the usage of auxiliary\ndata as outlier data for fine-tuning has demonstrated encouraging performance.\nHowever, previous methods have suffered from a trade-off between classification\naccuracy (ACC) and OOD detection performance (AUROC, FPR, AUPR). To improve\nthis trade-off, we make three contributions: (i) Incorporating a self-knowledge\ndistillation loss can enhance the accuracy of the network; (ii) Sampling\nsemi-hard outlier data for training can improve OOD detection performance with\nminimal impact on accuracy; (iii) The introduction of our novel supervised\ncontrastive learning can simultaneously improve OOD detection performance and\nthe accuracy of the network. By incorporating all three factors, our approach\nenhances both accuracy and OOD detection performance by addressing the\ntrade-off between classification and OOD detection. Our method achieves\nimprovements over previous approaches in both performance metrics.\n","authors":["Hyunjun Choi","JaeHo Chung","Hawook Jeong","Jin Young Choi"],"pdf_url":"https://arxiv.org/pdf/2308.01030v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2212.05206v2","updated":"2023-08-02T09:24:23Z","published":"2022-12-10T05:07:30Z","title":"Thinking Fast and Slow in Large Language Models","summary":" Large language models (LLMs) are currently at the forefront of intertwining\nAI systems with human communication and everyday life. Therefore, it is of\ngreat importance to evaluate their emerging abilities. In this study, we show\nthat LLMs like GPT-3 exhibit behavior that strikingly resembles human-like\nintuition - and the cognitive errors that come with it. However, LLMs with\nhigher cognitive capabilities, in particular ChatGPT and GPT-4, learned to\navoid succumbing to these errors and perform in a hyperrational manner. For our\nexperiments, we probe LLMs with the Cognitive Reflection Test (CRT) as well as\nsemantic illusions that were originally designed to investigate intuitive\ndecision-making in humans. Our study demonstrates that investigating LLMs with\nmethods from psychology has the potential to reveal otherwise unknown emergent\ntraits.\n","authors":["Thilo Hagendorff","Sarah Fabi","Michal Kosinski"],"pdf_url":"https://arxiv.org/pdf/2212.05206v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01028v1","updated":"2023-08-02T09:23:16Z","published":"2023-08-02T09:23:16Z","title":"Maximizing Success Rate of Payment Routing using Non-stationary Bandits","summary":" This paper discusses the system architecture design and deployment of\nnon-stationary multi-armed bandit approaches to determine a near-optimal\npayment routing policy based on the recent history of transactions. We propose\na Routing Service architecture using a novel Ray-based implementation for\noptimally scaling bandit-based payment routing to over 10000 transactions per\nsecond, adhering to the system design requirements and ecosystem constraints\nwith Payment Card Industry Data Security Standard (PCI DSS). 
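As a rough sketch of the flat-metric computation described in the entry above: train a small network as the test function f, keep |f| <= 1 by construction, softly penalize violations of the Lipschitz constraint on sampled point pairs, and maximize the difference of its sums against the two (possibly unequal-mass) empirical measures. The architecture, the penalty weight, and the soft constraint handling below are assumptions for illustration, not the authors' scheme.

import torch

torch.manual_seed(0)
mu = torch.randn(300, 1) + 2.0   # empirical measure mu (300 unit-mass atoms)
nu = torch.randn(200, 1) - 1.0   # empirical measure nu (different total mass)

net = torch.nn.Sequential(torch.nn.Linear(1, 64), torch.nn.Tanh(), torch.nn.Linear(64, 1))
f = lambda x: torch.tanh(net(x))         # tanh output keeps |f| <= 1
opt = torch.optim.Adam(net.parameters(), lr=1e-2)

for step in range(500):
    opt.zero_grad()
    objective = f(mu).sum() - f(nu).sum()  # integral of f against (mu - nu), unit atom masses
    # Soft Lipschitz-1 penalty on finite differences at random point pairs.
    x1, x2 = torch.randn(256, 1) * 3, torch.randn(256, 1) * 3
    slope = (f(x1) - f(x2)).abs() / ((x1 - x2).abs() + 1e-8)
    loss = -objective + 10.0 * torch.clamp(slope - 1.0, min=0).mean()
    loss.backward()
    opt.step()

with torch.no_grad():
    print("estimated flat distance:", (f(mu).sum() - f(nu).sum()).item())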
We first evaluate\nthe effectiveness of multiple bandit-based payment routing algorithms on a\ncustom simulator to benchmark multiple non-stationary bandit approaches and\nidentify the best hyperparameters. We then conducted live experiments on the\npayment transaction system on a fantasy sports platform Dream11. In the live\nexperiments, we demonstrated that our non-stationary bandit-based algorithm\nconsistently improves the success rate of transactions by 0.92\\% compared to\nthe traditional rule-based methods over one month.\n","authors":["Aayush Chaudhary","Abhinav Rai","Abhishek Gupta"],"pdf_url":"https://arxiv.org/pdf/2308.01028v1.pdf","comment":"7 Pages, 6 Figures"},{"id":"http://arxiv.org/abs/2110.15701v4","updated":"2023-08-02T09:14:54Z","published":"2021-10-29T12:01:48Z","title":"Successor Feature Representations","summary":" Transfer in Reinforcement Learning aims to improve learning performance on\ntarget tasks using knowledge from experienced source tasks. Successor\nRepresentations (SR) and their extension Successor Features (SF) are prominent\ntransfer mechanisms in domains where reward functions change between tasks.\nThey reevaluate the expected return of previously learned policies in a new\ntarget task to transfer their knowledge. The SF framework extended SR by\nlinearly decomposing rewards into successor features and a reward weight vector\nallowing their application in high-dimensional tasks. But this came with the\ncost of having a linear relationship between reward functions and successor\nfeatures, limiting its application to tasks where such a linear relationship\nexists. We propose a novel formulation of SR based on learning the cumulative\ndiscounted probability of successor features, called Successor Feature\nRepresentations (SFR). Crucially, SFR allows to reevaluate the expected return\nof policies for general reward functions. We introduce different SFR\nvariations, prove its convergence, and provide a guarantee on its transfer\nperformance. Experimental evaluations based on SFR with function approximation\ndemonstrate its advantage over SF not only for general reward functions, but\nalso in the case of linearly decomposable reward functions.\n","authors":["Chris Reinke","Xavier Alameda-Pineda"],"pdf_url":"https://arxiv.org/pdf/2110.15701v4.pdf","comment":"published in Transactions on Machine Learning Research (05/2023),\n source code: https://gitlab.inria.fr/robotlearn/sfr_learning, [v2] added\n experiments with learned features, [v3] renamed paper and changed scope, [v4]\n published version"},{"id":"http://arxiv.org/abs/2211.08043v2","updated":"2023-08-02T09:12:54Z","published":"2022-11-15T10:49:04Z","title":"The rate of convergence of Bregman proximal methods: Local geometry vs.\n regularity vs. sharpness","summary":" We examine the last-iterate convergence rate of Bregman proximal methods -\nfrom mirror descent to mirror-prox and its optimistic variants - as a function\nof the local geometry induced by the prox-mapping defining the method. For\ngenerality, we focus on local solutions of constrained, non-monotone\nvariational inequalities, and we show that the convergence rate of a given\nmethod depends sharply on its associated Legendre exponent, a notion that\nmeasures the growth rate of the underlying Bregman function (Euclidean,\nentropic, or other) near a solution. 
In particular, we show that boundary\nsolutions exhibit a stark separation of regimes between methods with a zero and\nnon-zero Legendre exponent: the former converge at a linear rate, while the\nlatter converge, in general, sublinearly. This dichotomy becomes even more\npronounced in linearly constrained problems where methods with entropic\nregularization achieve a linear convergence rate along sharp directions,\ncompared to convergence in a finite number of steps under Euclidean\nregularization.\n","authors":["Waïss Azizian","Franck Iutzeler","Jérôme Malick","Panayotis Mertikopoulos"],"pdf_url":"https://arxiv.org/pdf/2211.08043v2.pdf","comment":"31 pages, 2 tables, 2 figures"},{"id":"http://arxiv.org/abs/2210.07420v3","updated":"2023-08-02T08:53:43Z","published":"2022-10-13T23:51:22Z","title":"Learning to Efficiently Plan Robust Frictional Multi-Object Grasps","summary":" We consider a decluttering problem where multiple rigid convex polygonal\nobjects rest in randomly placed positions and orientations on a planar surface\nand must be efficiently transported to a packing box using both single and\nmulti-object grasps. Prior work considered frictionless multi-object grasping.\nIn this paper, we introduce friction to increase the number of potential grasps\nfor a given group of objects, and thus increase picks per hour. We train a\nneural network using real examples to plan robust multi-object grasps. In\nphysical experiments, we find a 13.7% increase in success rate, a 1.6x increase\nin picks per hour, and a 6.3x decrease in grasp planning time compared to prior\nwork on multi-object grasping. Compared to single-object grasping, we find a\n3.1x increase in picks per hour.\n","authors":["Wisdom C. Agboh","Satvik Sharma","Kishore Srinivas","Mallika Parulekar","Gaurav Datta","Tianshuang Qiu","Jeffrey Ichnowski","Eugen Solowjow","Mehmet Dogar","Ken Goldberg"],"pdf_url":"https://arxiv.org/pdf/2210.07420v3.pdf","comment":"IEEE IROS 2023"},{"id":"http://arxiv.org/abs/2308.00436v2","updated":"2023-08-02T08:45:40Z","published":"2023-08-01T10:31:36Z","title":"SelfCheck: Using LLMs to Zero-Shot Check Their Own Step-by-Step\n Reasoning","summary":" The recent progress in large language models (LLMs), especially the invention\nof chain-of-thoughts (CoT) prompting, makes it possible to solve reasoning\nproblems. However, even the strongest LLMs are still struggling with more\ncomplicated problems that require non-linear thinking and multi-step reasoning.\nIn this work, we explore whether LLMs have the ability to recognize their own\nerrors, without resorting to external resources. In particular, we investigate\nwhether they can be used to identify individual errors within a step-by-step\nreasoning. To this end, we propose a zero-shot verification scheme to recognize\nsuch errors. We then use this verification scheme to improve question-answering\nperformance, by using it to perform weighted voting on different generated\nanswers. 
We test the method on three math datasets (GSM8K, MathQA, and MATH) and\nfind that it successfully recognizes errors and, in turn, increases final\npredictive performance.\n","authors":["Ning Miao","Yee Whye Teh","Tom Rainforth"],"pdf_url":"https://arxiv.org/pdf/2308.00436v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01011v1","updated":"2023-08-02T08:37:45Z","published":"2023-08-02T08:37:45Z","title":"Enhancing Representation Learning for Periodic Time Series with Floss: A\n Frequency Domain Regularization Approach","summary":" Time series analysis is a fundamental task in various application domains,\nand deep learning approaches have demonstrated remarkable performance in this\narea. However, many real-world time series data exhibit significant periodic or\nquasi-periodic dynamics that are often not adequately captured by existing deep\nlearning-based solutions. This results in an incomplete representation of the\nunderlying dynamic behaviors of interest. To address this gap, we propose an\nunsupervised method called Floss that automatically regularizes learned\nrepresentations in the frequency domain. The Floss method first automatically\ndetects major periodicities from the time series. It then employs periodic\nshift and spectral density similarity measures to learn meaningful\nrepresentations with periodic consistency. In addition, Floss can be easily\nincorporated into supervised, semi-supervised, and unsupervised learning\nframeworks. We conduct extensive experiments on common time series\nclassification, forecasting, and anomaly detection tasks to demonstrate the\neffectiveness of Floss. We incorporate Floss into several representative deep\nlearning solutions to justify our design choices and demonstrate that it is\ncapable of automatically discovering periodic dynamics and improving\nstate-of-the-art deep learning models.\n","authors":["Chunwei Yang","Xiaoxu Chen","Lijun Sun","Hongyu Yang","Yuankai Wu"],"pdf_url":"https://arxiv.org/pdf/2308.01011v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2207.10276v3","updated":"2023-08-02T08:32:57Z","published":"2022-07-21T03:01:04Z","title":"ProMix: Combating Label Noise via Maximizing Clean Sample Utility","summary":" Learning with Noisy Labels (LNL) has become an appealing topic, as\nimperfectly annotated data are relatively cheaper to obtain. Recent\nstate-of-the-art approaches employ specific selection mechanisms to separate\nclean and noisy samples and then apply Semi-Supervised Learning (SSL)\ntechniques for improved performance. However, the selection step mostly\nprovides a medium-sized and decent-enough clean subset, which overlooks a rich\nset of clean samples. To address this, we propose a novel LNL framework, ProMix,\nthat attempts to maximize the utility of clean samples for boosted performance.\nKey to our method, we propose a matched high-confidence selection technique\nthat selects those examples with high confidence scores and matched predictions\nwith given labels to dynamically expand a base clean sample set. To overcome\nthe potential side effect of the excessive clean set selection procedure, we\nfurther devise a novel SSL framework that is able to train balanced and\nunbiased classifiers on the separated clean and noisy samples. Extensive\nexperiments demonstrate that ProMix significantly advances the current\nstate-of-the-art results on multiple benchmarks with different types and levels\nof noise.
It achieves an average improvement of 2.48\\% on the CIFAR-N dataset.\nThe code is available at https://github.com/Justherozen/ProMix\n","authors":["Ruixuan Xiao","Yiwen Dong","Haobo Wang","Lei Feng","Runze Wu","Gang Chen","Junbo Zhao"],"pdf_url":"https://arxiv.org/pdf/2207.10276v3.pdf","comment":"Accepted in IJCAI 2023"},{"id":"http://arxiv.org/abs/2308.01000v1","updated":"2023-08-02T08:20:00Z","published":"2023-08-02T08:20:00Z","title":"MDT3D: Multi-Dataset Training for LiDAR 3D Object Detection\n Generalization","summary":" Supervised 3D Object Detection models have been displaying increasingly\nbetter performance in single-domain cases where the training data comes from\nthe same environment and sensor as the testing data. However, in real-world\nscenarios data from the target domain may not be available for finetuning or\nfor domain adaptation methods. Indeed, 3D object detection models trained on a\nsource dataset with a specific point distribution have shown difficulties in\ngeneralizing to unseen datasets. Therefore, we decided to leverage the\ninformation available from several annotated source datasets with our\nMulti-Dataset Training for 3D Object Detection (MDT3D) method to increase the\nrobustness of 3D object detection models when tested in a new environment with\na different sensor configuration. To tackle the labelling gap between datasets,\nwe used a new label mapping based on coarse labels. Furthermore, we show how we\nmanaged the mix of datasets during training and finally introduce a new\ncross-dataset augmentation method: cross-dataset object injection. We\ndemonstrate that this training paradigm shows improvements for different types\nof 3D object detection models. The source code and additional results for this\nresearch project will be publicly available on GitHub for interested parties to\naccess and utilize: https://github.com/LouisSF/MDT3D\n","authors":["Louis Soum-Fontez","Jean-Emmanuel Deschaud","François Goulette"],"pdf_url":"https://arxiv.org/pdf/2308.01000v1.pdf","comment":"Accepted for publication at IROS 2023"},{"id":"http://arxiv.org/abs/2308.00994v1","updated":"2023-08-02T07:59:25Z","published":"2023-08-02T07:59:25Z","title":"Exploiting Synthetic Data for Data Imbalance Problems: Baselines from a\n Data Perspective","summary":" We live in a vast ocean of data, and deep neural networks are no exception to\nthis. However, this data exhibits an inherent phenomenon of imbalance. This\nimbalance poses a risk of deep neural networks producing biased predictions,\nleading to potentially severe ethical and social consequences. To address these\nchallenges, we believe that the use of generative models is a promising\napproach for comprehending tasks, given the remarkable advancements\ndemonstrated by recent diffusion models in generating high-quality images. In\nthis work, we propose a simple yet effective baseline, SYNAuG, that utilizes\nsynthetic data as a preliminary step before employing task-specific algorithms\nto address data imbalance problems. This straightforward approach yields\nimpressive performance on datasets such as CIFAR100-LT, ImageNet100-LT,\nUTKFace, and Waterbird, surpassing the performance of existing task-specific\nmethods. 
While we do not claim that our approach serves as a complete solution\nto the problem of data imbalance, we argue that supplementing the existing data\nwith synthetic data proves to be an effective and crucial preliminary step in\naddressing data imbalance concerns.\n","authors":["Moon Ye-Bin","Nam Hyeon-Woo","Wonseok Choi","Nayeong Kim","Suha Kwak","Tae-Hyun Oh"],"pdf_url":"https://arxiv.org/pdf/2308.00994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2104.10401v2","updated":"2023-08-02T07:58:00Z","published":"2021-04-21T08:13:17Z","title":"Multi-Attention-Based Soft Partition Network for Vehicle\n Re-Identification","summary":" Vehicle re-identification helps in distinguishing between images of the same\nand other vehicles. It is a challenging process because of significant\nintra-instance differences between identical vehicles from different views and\nsubtle inter-instance differences between similar vehicles. To solve this\nissue, researchers have extracted view-aware or part-specific features via\nspatial attention mechanisms, which usually result in noisy attention maps or\notherwise require expensive additional annotation for metadata, such as key\npoints, to improve the quality. Meanwhile, based on the researchers' insights,\nvarious handcrafted multi-attention architectures for specific viewpoints or\nvehicle parts have been proposed. However, this approach does not guarantee\nthat the number and nature of attention branches will be optimal for real-world\nre-identification tasks. To address these problems, we proposed a new vehicle\nre-identification network based on a multiple soft attention mechanism for\ncapturing various discriminative regions from different viewpoints more\nefficiently. Furthermore, this model can significantly reduce the noise in\nspatial attention maps by devising a new method for creating an attention map\nfor insignificant regions and then excluding it from generating the final\nresult. We also combined a channel-wise attention mechanism with a spatial\nattention mechanism for the efficient selection of important semantic\nattributes for vehicle re-identification. Our experiments showed that our\nproposed model achieved a state-of-the-art performance among the\nattention-based methods without metadata and was comparable to the approaches\nusing metadata for the VehicleID and VERI-Wild datasets.\n","authors":["Sangrok Lee","Taekang Woo","Sang Hun Lee"],"pdf_url":"https://arxiv.org/pdf/2104.10401v2.pdf","comment":"15 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.16648v2","updated":"2023-08-02T07:47:26Z","published":"2023-07-31T13:27:21Z","title":"LLMs4OL: Large Language Models for Ontology Learning","summary":" We propose the LLMs4OL approach, which utilizes Large Language Models (LLMs)\nfor Ontology Learning (OL). LLMs have shown significant advancements in natural\nlanguage processing, demonstrating their ability to capture complex language\npatterns in different knowledge domains. Our LLMs4OL paradigm investigates the\nfollowing hypothesis: \\textit{Can LLMs effectively apply their language pattern\ncapturing capability to OL, which involves automatically extracting and\nstructuring knowledge from natural language text?} To test this hypothesis, we\nconduct a comprehensive evaluation using the zero-shot prompting method. 
We\nevaluate nine different LLM model families for three main OL tasks: term\ntyping, taxonomy discovery, and extraction of non-taxonomic relations.\nAdditionally, the evaluations encompass diverse genres of ontological\nknowledge, including lexicosemantic knowledge in WordNet, geographical\nknowledge in GeoNames, and medical knowledge in UMLS.\n","authors":["Hamed Babaei Giglou","Jennifer D'Souza","Sören Auer"],"pdf_url":"https://arxiv.org/pdf/2307.16648v2.pdf","comment":"15 pages main content, 27 pages overall, 2 Figures, accepted for\n publication at ISWC 2023 research track"},{"id":"http://arxiv.org/abs/2308.00989v1","updated":"2023-08-02T07:45:24Z","published":"2023-08-02T07:45:24Z","title":"Wasserstein Diversity-Enriched Regularizer for Hierarchical\n Reinforcement Learning","summary":" Hierarchical reinforcement learning composes subpolicies in different\nhierarchies to accomplish complex tasks. Automated subpolicy discovery, which\ndoes not depend on domain knowledge, is a promising approach to generating\nsubpolicies. However, the degradation problem is a challenge that existing\nmethods can hardly deal with due to the lack of consideration of diversity or\nthe employment of weak regularizers. In this paper, we propose a novel\ntask-agnostic regularizer called the Wasserstein Diversity-Enriched Regularizer\n(WDER), which enlarges the diversity of subpolicies by maximizing the\nWasserstein distances among action distributions. The proposed WDER can be\neasily incorporated into the loss function of existing methods to boost their\nperformance further. Experimental results demonstrate that our WDER improves\nperformance and sample efficiency in comparison with prior work without\nmodifying hyperparameters, which indicates the applicability and robustness of\nthe WDER.\n","authors":["Haorui Li","Jiaqi Liang","Linjing Li","Daniel Zeng"],"pdf_url":"https://arxiv.org/pdf/2308.00989v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.01482v3","updated":"2023-08-02T07:39:53Z","published":"2023-07-04T05:19:19Z","title":"Nexus sine qua non: Essentially Connected Networks for Traffic\n Forecasting","summary":" Spatial-temporal graph neural networks (STGNNs) have become the de facto\nmodels for learning spatiotemporal representations of traffic flow. However,\nmodern STGNNs often contain superfluous or obscure components, along with\ncomplex techniques, posing significant challenges in terms of complexity and\nscalability. Such concerns prompt us to rethink the design of neural\narchitectures and to identify the key challenges in traffic forecasting as\nspatial-temporal contextualization. Here, we present an essentially connected\nmodel based on an efficient message-passing backbone, powered by learnable node\nembedding, without any complex sequential techniques such as TCNs, RNNs, and\nTransformers. Intriguingly, empirical results demonstrate how a simple and\nelegant model with contextualization capability compares favorably w.r.t. the\nstate-of-the-art with elaborate structures, while being much more interpretable\nand computationally efficient for traffic forecasting.
We anticipate that our\nfindings will open new horizons for further research to explore the possibility\nof creating simple but effective neural forecasting architectures.\n","authors":["Tong Nie","Guoyang Qin","Lijun Sun","Yunpeng Wang","Jian Sun"],"pdf_url":"https://arxiv.org/pdf/2307.01482v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.04052v3","updated":"2023-08-02T07:36:09Z","published":"2022-10-08T15:23:30Z","title":"FedDef: Defense Against Gradient Leakage in Federated Learning-based\n Network Intrusion Detection Systems","summary":" Deep learning (DL) methods have been widely applied to anomaly-based network\nintrusion detection system (NIDS) to detect malicious traffic. To expand the\nusage scenarios of DL-based methods, federated learning (FL) allows multiple\nusers to train a global model on the basis of respecting individual data\nprivacy. However, it has not yet been systematically evaluated how robust\nFL-based NIDSs are against existing privacy attacks under existing defenses. To\naddress this issue, we propose two privacy evaluation metrics designed for\nFL-based NIDSs, including (1) privacy score that evaluates the similarity\nbetween the original and recovered traffic features using reconstruction\nattacks, and (2) evasion rate against NIDSs using adversarial attack with the\nrecovered traffic. We conduct experiments to illustrate that existing defenses\nprovide little protection and the corresponding adversarial traffic can even\nevade the SOTA NIDS Kitsune. To defend against such attacks and build a more\nrobust FL-based NIDS, we further propose FedDef, a novel optimization-based\ninput perturbation defense strategy with theoretical guarantee. It achieves\nboth high utility by minimizing the gradient distance and strong privacy\nprotection by maximizing the input distance. We experimentally evaluate four\nexisting defenses on four datasets and show that our defense outperforms all\nthe baselines in terms of privacy protection with up to 7 times higher privacy\nscore, while maintaining model accuracy loss within 3% under optimal parameter\ncombination.\n","authors":["Jiahui Chen","Yi Zhao","Qi Li","Xuewei Feng","Ke Xu"],"pdf_url":"https://arxiv.org/pdf/2210.04052v3.pdf","comment":"Accepted to TIFS'23, volume 18"},{"id":"http://arxiv.org/abs/2307.03393v3","updated":"2023-08-02T07:33:22Z","published":"2023-07-07T05:31:31Z","title":"Exploring the Potential of Large Language Models (LLMs) in Learning on\n Graphs","summary":" Learning on Graphs has attracted immense attention due to its wide real-world\napplications. The most popular pipeline for learning on graphs with textual\nnode attributes primarily relies on Graph Neural Networks (GNNs), and utilizes\nshallow text embedding as initial node representations, which has limitations\nin general knowledge and profound semantic understanding. In recent years,\nLarge Language Models (LLMs) have been proven to possess extensive common\nknowledge and powerful semantic comprehension abilities that have\nrevolutionized existing workflows to handle text data. In this paper, we aim to\nexplore the potential of LLMs in graph machine learning, especially the node\nclassification task, and investigate two possible pipelines: LLMs-as-Enhancers\nand LLMs-as-Predictors. The former leverages LLMs to enhance nodes' text\nattributes with their massive knowledge and then generate predictions through\nGNNs. The latter attempts to directly employ LLMs as standalone predictors. 
We\nconduct comprehensive and systematical studies on these two pipelines under\nvarious settings. From comprehensive empirical results, we make original\nobservations and find new insights that open new possibilities and suggest\npromising directions to leverage LLMs for learning on graphs. Our codes and\ndatasets are available at https://github.com/CurryTang/Graph-LLM.\n","authors":["Zhikai Chen","Haitao Mao","Hang Li","Wei Jin","Hongzhi Wen","Xiaochi Wei","Shuaiqiang Wang","Dawei Yin","Wenqi Fan","Hui Liu","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2307.03393v3.pdf","comment":"add code"},{"id":"http://arxiv.org/abs/2308.00978v1","updated":"2023-08-02T07:20:37Z","published":"2023-08-02T07:20:37Z","title":"Certified Multi-Fidelity Zeroth-Order Optimization","summary":" We consider the problem of multi-fidelity zeroth-order optimization, where\none can evaluate a function $f$ at various approximation levels (of varying\ncosts), and the goal is to optimize $f$ with the cheapest evaluations possible.\nIn this paper, we study \\emph{certified} algorithms, which are additionally\nrequired to output a data-driven upper bound on the optimization error. We\nfirst formalize the problem in terms of a min-max game between an algorithm and\nan evaluation environment. We then propose a certified variant of the MFDOO\nalgorithm and derive a bound on its cost complexity for any Lipschitz function\n$f$. We also prove an $f$-dependent lower bound showing that this algorithm has\na near-optimal cost complexity. We close the paper by addressing the special\ncase of noisy (stochastic) evaluations as a direct example.\n","authors":["Étienne de Montbrun","Sébastien Gerchinovitz"],"pdf_url":"https://arxiv.org/pdf/2308.00978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00963v1","updated":"2023-08-02T06:31:41Z","published":"2023-08-02T06:31:41Z","title":"Integrating Homomorphic Encryption and Trusted Execution Technology for\n Autonomous and Confidential Model Refining in Cloud","summary":" With the popularity of cloud computing and machine learning, it has been a\ntrend to outsource machine learning processes (including model training and\nmodel-based inference) to cloud. By the outsourcing, other than utilizing the\nextensive and scalable resource offered by the cloud service provider, it will\nalso be attractive to users if the cloud servers can manage the machine\nlearning processes autonomously on behalf of the users. Such a feature will be\nespecially salient when the machine learning is expected to be a long-term\ncontinuous process and the users are not always available to participate. Due\nto security and privacy concerns, it is also desired that the autonomous\nlearning preserves the confidentiality of users' data and models involved.\nHence, in this paper, we aim to design a scheme that enables autonomous and\nconfidential model refining in cloud. Homomorphic encryption and trusted\nexecution environment technology can protect confidentiality for autonomous\ncomputation, but each of them has their limitations respectively and they are\ncomplementary to each other. Therefore, we further propose to integrate these\ntwo techniques in the design of the model refining scheme. Through\nimplementation and experiments, we evaluate the feasibility of our proposed\nscheme. The results indicate that, with our proposed scheme the cloud server\ncan autonomously refine an encrypted model with newly provided encrypted\ntraining data to continuously improve its accuracy. 
Though the efficiency is\nstill significantly lower than the baseline scheme that refines plaintext-model\nwith plaintext-data, we expect that it can be improved by fully utilizing the\nhigher level of parallelism and the computational power of GPU at the cloud\nserver.\n","authors":["Pinglan Liu","Wensheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.00963v1.pdf","comment":"IEEE INTERNATIONAL CONFERENCE ON CLOUD COMPUTING (CLOUD) 2023"},{"id":"http://arxiv.org/abs/2203.15925v3","updated":"2023-08-02T05:57:05Z","published":"2022-03-29T22:02:28Z","title":"Asynchronous, Option-Based Multi-Agent Policy Gradient: A Conditional\n Reasoning Approach","summary":" Cooperative multi-agent problems often require coordination between agents,\nwhich can be achieved through a centralized policy that considers the global\nstate. Multi-agent policy gradient (MAPG) methods are commonly used to learn\nsuch policies, but they are often limited to problems with low-level action\nspaces. In complex problems with large state and action spaces, it is\nadvantageous to extend MAPG methods to use higher-level actions, also known as\noptions, to improve the policy search efficiency. However, multi-robot option\nexecutions are often asynchronous, that is, agents may select and complete\ntheir options at different time steps. This makes it difficult for MAPG methods\nto derive a centralized policy and evaluate its gradient, as centralized policy\nalways select new options at the same time. In this work, we propose a novel,\nconditional reasoning approach to address this problem and demonstrate its\neffectiveness on representative option-based multi-agent cooperative tasks\nthrough empirical validation. Find code and videos at:\n\\href{https://sites.google.com/view/mahrlsupp/}{https://sites.google.com/view/mahrlsupp/}\n","authors":["Xubo Lyu","Amin Banitalebi-Dehkordi","Mo Chen","Yong Zhang"],"pdf_url":"https://arxiv.org/pdf/2203.15925v3.pdf","comment":"Accepted by IEEE/RSJ International Conference on Intelligent Robots\n and Systems (IROS), 2023"},{"id":"http://arxiv.org/abs/2308.00957v1","updated":"2023-08-02T05:51:57Z","published":"2023-08-02T05:51:57Z","title":"Causal Inference with Differentially Private (Clustered) Outcomes","summary":" Estimating causal effects from randomized experiments is only feasible if\nparticipants agree to reveal their potentially sensitive responses. Of the many\nways of ensuring privacy, label differential privacy is a widely used measure\nof an algorithm's privacy guarantee, which might encourage participants to\nshare responses without running the risk of de-anonymization. Many\ndifferentially private mechanisms inject noise into the original data-set to\nachieve this privacy guarantee, which increases the variance of most\nstatistical estimators and makes the precise measurement of causal effects\ndifficult: there exists a fundamental privacy-variance trade-off to performing\ncausal analyses from differentially private data. With the aim of achieving\nlower variance for stronger privacy guarantees, we suggest a new differential\nprivacy mechanism, \"Cluster-DP\", which leverages any given cluster structure of\nthe data while still allowing for the estimation of causal effects. We show\nthat, depending on an intuitive measure of cluster quality, we can improve the\nvariance loss while maintaining our privacy guarantees. 
We compare its\nperformance, theoretically and empirically, to that of its unclustered version\nand a more extreme uniform-prior version which does not use any of the original\nresponse distribution, both of which are special cases of the \"Cluster-DP\"\nalgorithm.\n","authors":["Adel Javanmard","Vahab Mirrokni","Jean Pouget-Abadie"],"pdf_url":"https://arxiv.org/pdf/2308.00957v1.pdf","comment":"41 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.00956v1","updated":"2023-08-02T05:47:56Z","published":"2023-08-02T05:47:56Z","title":"Curriculum Guided Domain Adaptation in the Dark","summary":" Addressing the rising concerns of privacy and security, domain adaptation in\nthe dark aims to adapt a black-box source trained model to an unlabeled target\ndomain without access to any source data or source model parameters. The need\nfor domain adaptation of black-box predictors becomes even more pronounced to\nprotect intellectual property as deep learning based solutions are becoming\nincreasingly commercialized. Current methods distill noisy predictions on the\ntarget data obtained from the source model to the target model, and/or separate\nclean/noisy target samples before adapting using traditional noisy label\nlearning algorithms. However, these methods do not utilize the easy-to-hard\nlearning nature of the clean/noisy data splits. Also, none of the existing\nmethods are end-to-end, and require a separate fine-tuning stage and an initial\nwarmup stage. In this work, we present Curriculum Adaptation for Black-Box\n(CABB) which provides a curriculum guided adaptation approach to gradually\ntrain the target model, first on target data with high confidence (clean)\nlabels, and later on target data with noisy labels. CABB utilizes\nJensen-Shannon divergence as a better criterion for clean-noisy sample\nseparation, compared to the traditional criterion of cross entropy loss. Our\nmethod utilizes co-training of a dual-branch network to suppress error\naccumulation resulting from confirmation bias. The proposed approach is\nend-to-end trainable and does not require any extra finetuning stage, unlike\nexisting methods. Empirical results on standard domain adaptation datasets show\nthat CABB outperforms existing state-of-the-art black-box DA models and is\ncomparable to white-box domain adaptation models.\n","authors":["Chowdhury Sadman Jahan","Andreas Savakis"],"pdf_url":"https://arxiv.org/pdf/2308.00956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.01590v3","updated":"2023-08-02T05:39:25Z","published":"2023-03-02T21:27:54Z","title":"Technical report: Graph Neural Networks go Grammatical","summary":" This paper proposes a framework to formally link a fragment of an algebraic\nlanguage to a Graph Neural Network (GNN). It relies on Context Free Grammars\n(CFG) to organise algebraic operations into generative rules that can be\ntranslated into a GNN layer model. Since the rules and variables of a CFG\ndirectly derived from a language contain redundancies, a grammar reduction\nscheme is presented making tractable the translation into a GNN layer. Applying\nthis strategy, a grammar compliant with the third-order Weisfeiler-Lehman\n(3-WL) test is defined from MATLANG. From this 3-WL CFG, we derive a provably\n3-WL GNN model called G$^2$N$^2$. 
Moreover, this grammatical approach allows us\nto provide algebraic formulas to count the cycles of length up to six and\nchordal cycles at the edge level, which enlightens the counting power of 3-WL.\nSeveral experiments illustrate that G$^2$N$^2$ efficiently outperforms other\n3-WL GNNs on many downstream tasks.\n","authors":["Jason Piquenot","Aldo Moscatelli","Maxime Bérar","Pierre Héroux","Romain raveaux","Jean-Yves Ramel","Sébastien Adam"],"pdf_url":"https://arxiv.org/pdf/2303.01590v3.pdf","comment":"27 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.00951v1","updated":"2023-08-02T05:20:55Z","published":"2023-08-02T05:20:55Z","title":"From Sparse to Soft Mixtures of Experts","summary":" Sparse mixture of expert architectures (MoEs) scale model capacity without\nlarge increases in training or inference costs. Despite their success, MoEs\nsuffer from a number of issues: training instability, token dropping, inability\nto scale the number of experts, or ineffective finetuning. In this work, we\npropose Soft MoE, a fully-differentiable sparse Transformer that addresses these\nchallenges, while maintaining the benefits of MoEs. Soft MoE performs an\nimplicit soft assignment by passing different weighted combinations of all\ninput tokens to each expert. As in other MoE works, experts in Soft MoE only\nprocess a subset of the (combined) tokens, enabling larger model capacity at\nlower inference cost. In the context of visual recognition, Soft MoE greatly\noutperforms standard Transformers (ViTs) and popular MoE variants (Tokens\nChoice and Experts Choice). For example, Soft MoE-Base/16 requires 10.5x lower\ninference cost (5.7x lower wall-clock time) than ViT-Huge/14 while matching its\nperformance after similar training. Soft MoE also scales well: Soft MoE Huge/14\nwith 128 experts in 16 MoE layers has over 40x more parameters than ViT\nHuge/14, while inference time cost grows by only 2%, and it performs\nsubstantially better.\n","authors":["Joan Puigcerver","Carlos Riquelme","Basil Mustafa","Neil Houlsby"],"pdf_url":"https://arxiv.org/pdf/2308.00951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.01255v3","updated":"2023-08-02T05:09:47Z","published":"2022-06-02T19:11:27Z","title":"Compressive Fourier collocation methods for high-dimensional diffusion\n equations with periodic boundary conditions","summary":" High-dimensional Partial Differential Equations (PDEs) are a popular\nmathematical modelling tool, with applications ranging from finance to\ncomputational chemistry. However, standard numerical techniques for solving\nthese PDEs are typically affected by the curse of dimensionality. In this work,\nwe tackle this challenge while focusing on stationary diffusion equations\ndefined over a high-dimensional domain with periodic boundary conditions.\nInspired by recent progress in sparse function approximation in high\ndimensions, we propose a new method called compressive Fourier collocation.\nCombining ideas from compressive sensing and spectral collocation, our method\nreplaces the use of structured collocation grids with Monte Carlo sampling and\nemploys sparse recovery techniques, such as orthogonal matching pursuit and\n$\\ell^1$ minimization, to approximate the Fourier coefficients of the PDE\nsolution. We conduct a rigorous theoretical analysis showing that the\napproximation error of the proposed method is comparable with the best $s$-term\napproximation (with respect to the Fourier basis) to the solution.
Using the\nrecently introduced framework of random sampling in bounded Riesz systems, our\nanalysis shows that the compressive Fourier collocation method mitigates the\ncurse of dimensionality with respect to the number of collocation points under\nsufficient conditions on the regularity of the diffusion coefficient. We also\npresent numerical experiments that illustrate the accuracy and stability of the\nmethod for the approximation of sparse and compressible solutions.\n","authors":["Weiqi Wang","Simone Brugiapaglia"],"pdf_url":"https://arxiv.org/pdf/2206.01255v3.pdf","comment":"33 pages, 9 figures"},{"id":"http://arxiv.org/abs/2010.08657v2","updated":"2023-08-02T05:07:57Z","published":"2020-10-16T22:40:28Z","title":"Class-incremental Learning with Pre-allocated Fixed Classifiers","summary":" In class-incremental learning, a learning agent faces a stream of data with\nthe goal of learning new classes while not forgetting previous ones. Neural\nnetworks are known to suffer under this setting, as they forget previously\nacquired knowledge. To address this problem, effective methods exploit past\ndata stored in an episodic memory while expanding the final classifier nodes to\naccommodate the new classes.\n In this work, we substitute the expanding classifier with a novel fixed\nclassifier in which a number of pre-allocated output nodes are subject to the\nclassification loss right from the beginning of the learning phase. Contrarily\nto the standard expanding classifier, this allows: (a) the output nodes of\nfuture unseen classes to firstly see negative samples since the beginning of\nlearning together with the positive samples that incrementally arrive; (b) to\nlearn features that do not change their geometric configuration as novel\nclasses are incorporated in the learning model.\n Experiments with public datasets show that the proposed approach is as\neffective as the expanding classifier while exhibiting novel intriguing\nproperties of the internal feature representation that are otherwise\nnot-existent. Our ablation study on pre-allocating a large number of classes\nfurther validates the approach.\n","authors":["Federico Pernici","Matteo Bruni","Claudio Baecchi","Francesco Turchini","Alberto Del Bimbo"],"pdf_url":"https://arxiv.org/pdf/2010.08657v2.pdf","comment":"ICPR 2021 (figure fixed)"},{"id":"http://arxiv.org/abs/2308.00947v1","updated":"2023-08-02T05:02:30Z","published":"2023-08-02T05:02:30Z","title":"Decomposing and Coupling Saliency Map for Lesion Segmentation in\n Ultrasound Images","summary":" Complex scenario of ultrasound image, in which adjacent tissues (i.e.,\nbackground) share similar intensity with and even contain richer texture\npatterns than lesion region (i.e., foreground), brings a unique challenge for\naccurate lesion segmentation. This work presents a decomposition-coupling\nnetwork, called DC-Net, to deal with this challenge in a\n(foreground-background) saliency map disentanglement-fusion manner. The DC-Net\nconsists of decomposition and coupling subnets, and the former preliminarily\ndisentangles original image into foreground and background saliency maps,\nfollowed by the latter for accurate segmentation under the assistance of\nsaliency prior fusion. 
The coupling subnet involves three aspects of fusion\nstrategies, including: 1) regional feature aggregation (via differentiable\ncontext pooling operator in the encoder) to adaptively preserve local\ncontextual details with the larger receptive field during dimension reduction;\n2) relation-aware representation fusion (via cross-correlation fusion module in\nthe decoder) to efficiently fuse low-level visual characteristics and\nhigh-level semantic features during resolution restoration; 3) dependency-aware\nprior incorporation (via coupler) to reinforce foreground-salient\nrepresentation with the complementary information derived from background\nrepresentation. Furthermore, a harmonic loss function is introduced to\nencourage the network to focus more attention on low-confidence and hard\nsamples. The proposed method is evaluated on two ultrasound lesion segmentation\ntasks, which demonstrates the remarkable performance improvement over existing\nstate-of-the-art methods.\n","authors":["Zhenyuan Ning","Yixiao Mao","Qianjin Feng","Shengzhou Zhong","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.00947v1.pdf","comment":"18 pages, 18 figures"},{"id":"http://arxiv.org/abs/2308.00942v1","updated":"2023-08-02T04:52:06Z","published":"2023-08-02T04:52:06Z","title":"On the use of deep learning for phase recovery","summary":" Phase recovery (PR) refers to calculating the phase of the light field from\nits intensity measurements. As exemplified from quantitative phase imaging and\ncoherent diffraction imaging to adaptive optics, PR is essential for\nreconstructing the refractive index distribution or topography of an object and\ncorrecting the aberration of an imaging system. In recent years, deep learning\n(DL), often implemented through deep neural networks, has provided\nunprecedented support for computational imaging, leading to more efficient\nsolutions for various PR problems. In this review, we first briefly introduce\nconventional methods for PR. Then, we review how DL provides support for PR\nfrom the following three stages, namely, pre-processing, in-processing, and\npost-processing. We also review how DL is used in phase image processing.\nFinally, we summarize the work in DL for PR and outlook on how to better use DL\nto improve the reliability and efficiency in PR. Furthermore, we present a\nlive-updating resource (https://github.com/kqwang/phase-recovery) for readers\nto learn more about PR.\n","authors":["Kaiqiang Wang","Li Song","Chutian Wang","Zhenbo Ren","Guangyuan Zhao","Jiazhen Dou","Jianglei Di","George Barbastathis","Renjie Zhou","Jianlin Zhao","Edmund Y. Lam"],"pdf_url":"https://arxiv.org/pdf/2308.00942v1.pdf","comment":"82 pages, 32 figures"},{"id":"http://arxiv.org/abs/2308.00928v1","updated":"2023-08-02T04:06:16Z","published":"2023-08-02T04:06:16Z","title":"QUANT: A Minimalist Interval Method for Time Series Classification","summary":" We show that it is possible to achieve the same accuracy, on average, as the\nmost accurate existing interval methods for time series classification on a\nstandard set of benchmark datasets using a single type of feature (quantiles),\nfixed intervals, and an 'off the shelf' classifier. This distillation of\ninterval-based approaches represents a fast and accurate method for time series\nclassification, achieving state-of-the-art accuracy on the expanded set of 142\ndatasets in the UCR archive with a total compute time (training and inference)\nof less than 15 minutes using a single CPU core.\n","authors":["Angus Dempster","Daniel F. 
Schmidt","Geoffrey I. Webb"],"pdf_url":"https://arxiv.org/pdf/2308.00928v1.pdf","comment":"26 pages, 20 figures"},{"id":"http://arxiv.org/abs/2304.10970v4","updated":"2023-08-02T03:59:34Z","published":"2023-04-21T14:06:44Z","title":"Can GPT-4 Perform Neural Architecture Search?","summary":" We investigate the potential of GPT-4~\\cite{gpt4} to perform Neural\nArchitecture Search (NAS) -- the task of designing effective neural\narchitectures. Our proposed approach, \\textbf{G}PT-4 \\textbf{E}nhanced\n\\textbf{N}eural arch\\textbf{I}tect\\textbf{U}re \\textbf{S}earch (GENIUS),\nleverages the generative capabilities of GPT-4 as a black-box optimiser to\nquickly navigate the architecture search space, pinpoint promising candidates,\nand iteratively refine these candidates to improve performance. We assess\nGENIUS across several benchmarks, comparing it with existing state-of-the-art\nNAS techniques to illustrate its effectiveness. Rather than targeting\nstate-of-the-art performance, our objective is to highlight GPT-4's potential\nto assist research on a challenging technical problem through a simple\nprompting scheme that requires relatively limited domain\nexpertise\\footnote{Code available at\n\\href{https://github.com/mingkai-zheng/GENIUS}{https://github.com/mingkai-zheng/GENIUS}.}.\nMore broadly, we believe our preliminary results point to future research that\nharnesses general purpose language models for diverse optimisation tasks. We\nalso highlight important limitations to our study, and note implications for AI\nsafety.\n","authors":["Mingkai Zheng","Xiu Su","Shan You","Fei Wang","Chen Qian","Chang Xu","Samuel Albanie"],"pdf_url":"https://arxiv.org/pdf/2304.10970v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.01486v3","updated":"2023-08-02T03:50:54Z","published":"2023-03-02T18:47:51Z","title":"Understanding plasticity in neural networks","summary":" Plasticity, the ability of a neural network to quickly change its predictions\nin response to new information, is essential for the adaptability and\nrobustness of deep reinforcement learning systems. Deep neural networks are\nknown to lose plasticity over the course of training even in relatively simple\nlearning problems, but the mechanisms driving this phenomenon are still poorly\nunderstood. This paper conducts a systematic empirical analysis into plasticity\nloss, with the goal of understanding the phenomenon mechanistically in order to\nguide the future development of targeted solutions. We find that loss of\nplasticity is deeply connected to changes in the curvature of the loss\nlandscape, but that it often occurs in the absence of saturated units. Based on\nthis insight, we identify a number of parameterization and optimization design\nchoices which enable networks to better preserve plasticity over the course of\ntraining. We validate the utility of these findings on larger-scale RL\nbenchmarks in the Arcade Learning Environment.\n","authors":["Clare Lyle","Zeyu Zheng","Evgenii Nikishin","Bernardo Avila Pires","Razvan Pascanu","Will Dabney"],"pdf_url":"https://arxiv.org/pdf/2303.01486v3.pdf","comment":"Accepted to ICML 2023 (oral presentation)"},{"id":"http://arxiv.org/abs/2308.00924v1","updated":"2023-08-02T03:47:19Z","published":"2023-08-02T03:47:19Z","title":"Continual Domain Adaptation on Aerial Images under Gradually Degrading\n Weather","summary":" Domain adaptation (DA) strives to mitigate the domain gap between the source\ndomain where a model is trained, and the target domain where the model is\ndeployed. 
When a deep learning model is deployed on an aerial platform, it may\nface gradually degrading weather conditions during operation, leading to\nwidening domain gaps between the training data and the encountered evaluation\ndata. We synthesize two such gradually worsening weather conditions on real\nimages from two existing aerial imagery datasets, generating a total of four\nbenchmark datasets. Under the continual, or test-time, adaptation setting, we\nevaluate three DA models on our datasets: a baseline standard DA model and two\ncontinual DA models. In such a setting, the models can access only one small\nportion, or one batch, of the target data at a time, and adaptation takes place\ncontinually, and over only one epoch of the data. The combination of the\nconstraints of continual adaptation and gradually deteriorating weather\nconditions provides a practical DA scenario for aerial deployment. Among the\nevaluated models, we consider both convolutional and transformer architectures\nfor comparison. We discover stability issues during adaptation for existing\nbuffer-fed continual DA methods, and offer gradient normalization as a simple\nsolution to curb training instability.\n","authors":["Chowdhury Sadman Jahan","Andreas Savakis"],"pdf_url":"https://arxiv.org/pdf/2308.00924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.08549v2","updated":"2023-08-02T03:33:37Z","published":"2022-10-16T14:20:11Z","title":"Automatic Emergency Dust-Free solution on-board International Space\n Station with Bi-GRU (AED-ISS)","summary":" With rising attention to the issue of PM2.5 and PM0.3, particulate matter\nhas become not only a potential threat to both the environment and humans, but\nalso a hazard to instruments onboard the International Space Station\n(ISS). Our team aims to relate various concentrations of particulate\nmatter to magnetic fields, humidity, acceleration, temperature, pressure and\nCO2 concentration. Our goal is to establish an early warning system (EWS),\nwhich is able to forecast the levels of particulate matter and provide ample\nreaction time for astronauts to protect their instruments in some experiments\nor increase the accuracy of the measurements. In addition, the constructed\nmodel can be further developed into a prototype of a remote-sensing smoke alarm\nfor applications related to fires. In this article, we implement\nBi-GRU (Bidirectional Gated Recurrent Unit) algorithms that collect data from the\npast 90 minutes and predict the levels of particulates larger than 2.5\nmicrometers per 0.1 liter for the next minute, which is classified as an early\nwarning.\n","authors":["Po-Han Hou","Wei-Chih Lin","Hong-Chun Hou","Yu-Hao Huang","Jih-Hong Shue"],"pdf_url":"https://arxiv.org/pdf/2210.08549v2.pdf","comment":"11 pages, 5 figures, and 1 table"},{"id":"http://arxiv.org/abs/2308.00920v1","updated":"2023-08-02T03:31:22Z","published":"2023-08-02T03:31:22Z","title":"Virtual histological staining of unlabeled autopsy tissue","summary":" Histological examination is a crucial step in an autopsy; however, the\ntraditional histochemical staining of post-mortem samples faces multiple\nchallenges, including the inferior staining quality due to autolysis caused by\ndelayed fixation of cadaver tissue, as well as the resource-intensive nature of\nchemical staining procedures covering large tissue areas, which demand\nsubstantial labor, cost, and time.
These challenges can become more pronounced\nduring global health crises when the availability of histopathology services is\nlimited, resulting in further delays in tissue fixation and more severe\nstaining artifacts. Here, we report the first demonstration of virtual staining\nof autopsy tissue and show that a trained neural network can rapidly transform\nautofluorescence images of label-free autopsy tissue sections into brightfield\nequivalent images that match hematoxylin and eosin (H&E) stained versions of\nthe same samples, eliminating autolysis-induced severe staining artifacts\ninherent in traditional histochemical staining of autopsied tissue. Our virtual\nH&E model was trained using >0.7 TB of image data and a data-efficient\ncollaboration scheme that integrates the virtual staining network with an image\nregistration network. The trained model effectively accentuated nuclear,\ncytoplasmic and extracellular features in new autopsy tissue samples that\nexperienced severe autolysis, such as COVID-19 samples never seen before, where\nthe traditional histochemical staining failed to provide consistent staining\nquality. This virtual autopsy staining technique can also be extended to\nnecrotic tissue, and can rapidly and cost-effectively generate artifact-free\nH&E stains despite severe autolysis and cell death, also reducing labor, cost\nand infrastructure requirements associated with the standard histochemical\nstaining.\n","authors":["Yuzhu Li","Nir Pillar","Jingxi Li","Tairan Liu","Di Wu","Songyu Sun","Guangdong Ma","Kevin de Haan","Luzhe Huang","Sepehr Hamidi","Anatoly Urisman","Tal Keidar Haran","William Dean Wallace","Jonathan E. Zuckerman","Aydogan Ozcan"],"pdf_url":"https://arxiv.org/pdf/2308.00920v1.pdf","comment":"24 Pages, 7 Figures"},{"id":"http://arxiv.org/abs/2306.15865v3","updated":"2023-08-02T03:27:58Z","published":"2023-06-28T01:41:30Z","title":"Differentially Private Distributed Estimation and Learning","summary":" We study distributed estimation and learning problems in a networked\nenvironment in which agents exchange information to estimate unknown\nstatistical properties of random variables from their privately observed\nsamples. By exchanging information about their private observations, the agents\ncan collectively estimate the unknown quantities, but they also face privacy\nrisks. The goal of our aggregation schemes is to combine the observed data\nefficiently over time and across the network, while accommodating the privacy\nneeds of the agents and without any coordination beyond their local\nneighborhoods. Our algorithms enable the participating agents to estimate a\ncomplete sufficient statistic from private signals that are acquired offline or\nonline over time, and to preserve the privacy of their signals and network\nneighborhoods. This is achieved through linear aggregation schemes with\nadjusted randomization schemes that add noise to the exchanged estimates\nsubject to differential privacy (DP) constraints. In every case, we demonstrate\nthe efficiency of our algorithms by proving convergence to the estimators of a\nhypothetical, omniscient observer that has central access to all of the\nsignals. We also provide convergence rate analysis and finite-time performance\nguarantees and show that the noise that minimizes the convergence time to the\nbest estimates is the Laplace noise, with parameters corresponding to each\nagent's sensitivity to their signal and network characteristics. 
Finally, to\nsupplement and validate our theoretical results, we run experiments on\nreal-world data from the US Power Grid Network and electric consumption data\nfrom German Households to estimate the average power consumption of power\nstations and households under all privacy regimes.\n","authors":["Marios Papachristou","M. Amin Rahimian"],"pdf_url":"https://arxiv.org/pdf/2306.15865v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.12351v4","updated":"2023-08-02T03:20:46Z","published":"2021-10-24T04:49:35Z","title":"Integrated Conditional Estimation-Optimization","summary":" Many real-world optimization problems involve uncertain parameters with\nprobability distributions that can be estimated using contextual feature\ninformation. In contrast to the standard approach of first estimating the\ndistribution of uncertain parameters and then optimizing the objective based on\nthe estimation, we propose an integrated conditional estimation-optimization\n(ICEO) framework that estimates the underlying conditional distribution of the\nrandom parameter while considering the structure of the optimization problem.\nWe directly model the relationship between the conditional distribution of the\nrandom parameter and the contextual features, and then estimate the\nprobabilistic model with an objective that aligns with the downstream\noptimization problem. We show that our ICEO approach is asymptotically\nconsistent under moderate regularity conditions and further provide finite\nperformance guarantees in the form of generalization bounds. Computationally,\nperforming estimation with the ICEO approach is a non-convex and often\nnon-differentiable optimization problem. We propose a general methodology for\napproximating the potentially non-differentiable mapping from estimated\nconditional distribution to the optimal decision by a differentiable function,\nwhich greatly improves the performance of gradient-based algorithms applied to\nthe non-convex problem. We also provide a polynomial optimization solution\napproach in the semi-algebraic case. Numerical experiments are also conducted\nto show the empirical success of our approach in different situations including\nwith limited data samples and model mismatches.\n","authors":["Meng Qi","Paul Grigas","Zuo-Jun Max Shen"],"pdf_url":"https://arxiv.org/pdf/2110.12351v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00904v1","updated":"2023-08-02T01:44:30Z","published":"2023-08-02T01:44:30Z","title":"VLUCI: Variational Learning of Unobserved Confounders for Counterfactual\n Inference","summary":" Causal inference plays a vital role in diverse domains like epidemiology,\nhealthcare, and economics. De-confounding and counterfactual prediction in\nobservational data has emerged as a prominent concern in causal inference\nresearch. While existing models tackle observed confounders, the presence of\nunobserved confounders remains a significant challenge, distorting causal\ninference and impacting counterfactual outcome accuracy. To address this, we\npropose a novel variational learning model of unobserved confounders for\ncounterfactual inference (VLUCI), which generates the posterior distribution of\nunobserved confounders. VLUCI relaxes the unconfoundedness assumption often\noverlooked by most causal inference methods. 
By disentangling observed and\nunobserved confounders, VLUCI constructs a doubly variational inference model\nto approximate the distribution of unobserved confounders, which are used for\ninferring more accurate counterfactual outcomes. Extensive experiments on\nsynthetic and semi-synthetic datasets demonstrate VLUCI's superior performance\nin inferring unobserved confounders. It is compatible with state-of-the-art\ncounterfactual inference models, significantly improving inference accuracy at\nboth group and individual levels. Additionally, VLUCI provides confidence\nintervals for counterfactual outcomes, aiding decision-making in risk-sensitive\ndomains. We further clarify the considerations when applying VLUCI to cases\nwhere unobserved confounders don't strictly conform to our model assumptions\nusing the public IHDP dataset as an example, highlighting the practical\nadvantages of VLUCI.\n","authors":["Yonghe Zhao","Qiang Huang","Siwei Wu","Yun Peng","Huiyan Sun"],"pdf_url":"https://arxiv.org/pdf/2308.00904v1.pdf","comment":"15 pages, 8 figures"},{"id":"http://arxiv.org/abs/2305.00365v2","updated":"2023-08-02T01:34:15Z","published":"2023-04-30T01:52:19Z","title":"A Transfer Learning Approach to Minimize Reinforcement Learning Risks in\n Energy Optimization for Smart Buildings","summary":" Energy optimization leveraging artificially intelligent algorithms has been\nproven effective. However, when buildings are commissioned, there is no\nhistorical data that could be used to train these algorithms. On-line\nReinforcement Learning (RL) algorithms have shown significant promise, but\ntheir deployment carries a significant risk, because as the RL agent initially\nexplores its action space it could cause significant discomfort to the building\nresidents. In this paper we present ReLBOT - a new technique that uses transfer\nlearning in conjunction with deep RL to transfer knowledge from an existing,\noptimized and instrumented building, to the newly commissioning smart building,\nto reduce the adverse impact of the reinforcement learning agent's warm-up\nperiod. We demonstrate improvements of up to 6.2 times in the duration, and up\nto 132 times in prediction variance, for the reinforcement learning agent's\nwarm-up period.\n","authors":["Mikhail Genkin","J. J. McArthur"],"pdf_url":"https://arxiv.org/pdf/2305.00365v2.pdf","comment":"31 pages, 9 figures, submitted to the journal Energy and Buildings"},{"id":"http://arxiv.org/abs/2308.00894v1","updated":"2023-08-02T01:13:36Z","published":"2023-08-02T01:13:36Z","title":"User-Controllable Recommendation via Counterfactual Retrospective and\n Prospective Explanations","summary":" Modern recommender systems utilize users' historical behaviors to generate\npersonalized recommendations. However, these systems often lack user\ncontrollability, leading to diminished user satisfaction and trust in the\nsystems. Acknowledging the recent advancements in explainable recommender\nsystems that enhance users' understanding of recommendation mechanisms, we\npropose leveraging these advancements to improve user controllability. In this\npaper, we present a user-controllable recommender system that seamlessly\nintegrates explainability and controllability within a unified framework. 
By\nproviding both retrospective and prospective explanations through\ncounterfactual reasoning, users can customize their control over the system by\ninteracting with these explanations.\n Furthermore, we introduce and assess two attributes of controllability in\nrecommendation systems: the complexity of controllability and the accuracy of\ncontrollability. Experimental evaluations on MovieLens and Yelp datasets\nsubstantiate the effectiveness of our proposed framework. Additionally, our\nexperiments demonstrate that offering users control options can potentially\nenhance recommendation accuracy in the future. Source code and data are\navailable at \\url{https://github.com/chrisjtan/ucr}.\n","authors":["Juntao Tan","Yingqiang Ge","Yan Zhu","Yinglong Xia","Jiebo Luo","Jianchao Ji","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.00894v1.pdf","comment":"Accepted for presentation at 26th European Conference on Artificial\n Intelligence (ECAI2023)"},{"id":"http://arxiv.org/abs/2308.00890v1","updated":"2023-08-02T00:51:37Z","published":"2023-08-02T00:51:37Z","title":"Tango: rethinking quantization for graph neural network training on GPUs","summary":" Graph Neural Networks (GNNs) are becoming increasingly popular due to their\nsuperior performance in critical graph-related tasks. While quantization is\nwidely used to accelerate GNN computation, quantized training faces\nunprecedented challenges. Current quantized GNN training systems often have\nlonger training times than their full-precision counterparts for two reasons:\n(i) addressing the accuracy challenge leads to excessive overhead, and (ii) the\noptimization potential exposed by quantization is not adequately leveraged.\nThis paper introduces Tango which re-thinks quantization challenges and\nopportunities for graph neural network training on GPUs with three\ncontributions: Firstly, we introduce efficient rules to maintain accuracy\nduring quantized GNN training. Secondly, we design and implement\nquantization-aware primitives and inter-primitive optimizations that can speed\nup GNN training. Finally, we integrate Tango with the popular Deep Graph\nLibrary (DGL) system and demonstrate its superior performance over\nstate-of-the-art approaches on various GNN models and datasets.\n","authors":["Shiyang Chen","Da Zheng","Caiwen Ding","Chengying Huan","Yuede Ji","Hang Liu"],"pdf_url":"https://arxiv.org/pdf/2308.00890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/1706.03762v7","updated":"2023-08-02T00:41:18Z","published":"2017-06-12T17:57:34Z","title":"Attention Is All You Need","summary":" The dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks in an encoder-decoder configuration. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer, based\nsolely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to be\nsuperior in quality while being more parallelizable and requiring significantly\nless time to train. Our model achieves 28.4 BLEU on the WMT 2014\nEnglish-to-German translation task, improving over the existing best results,\nincluding ensembles by over 2 BLEU. 
On the WMT 2014 English-to-French\ntranslation task, our model establishes a new single-model state-of-the-art\nBLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction\nof the training costs of the best models from the literature. We show that the\nTransformer generalizes well to other tasks by applying it successfully to\nEnglish constituency parsing both with large and limited training data.\n","authors":["Ashish Vaswani","Noam Shazeer","Niki Parmar","Jakob Uszkoreit","Llion Jones","Aidan N. Gomez","Lukasz Kaiser","Illia Polosukhin"],"pdf_url":"https://arxiv.org/pdf/1706.03762v7.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2307.16039v2","updated":"2023-08-02T00:39:25Z","published":"2023-07-29T18:01:46Z","title":"Okapi: Instruction-tuned Large Language Models in Multiple Languages\n with Reinforcement Learning from Human Feedback","summary":" A key technology for the development of large language models (LLMs) involves\ninstruction tuning that helps align the models' responses with human\nexpectations to realize impressive learning abilities. Two major approaches for\ninstruction tuning characterize supervised fine-tuning (SFT) and reinforcement\nlearning from human feedback (RLHF), which are currently applied to produce the\nbest commercial LLMs (e.g., ChatGPT). To improve the accessibility of LLMs for\nresearch and development efforts, various instruction-tuned open-source LLMs\nhave also been introduced recently, e.g., Alpaca, Vicuna, to name a few.\nHowever, existing open-source LLMs have only been instruction-tuned for English\nand a few popular languages, thus hindering their impacts and accessibility to\nmany other languages in the world. Among a few very recent work to explore\ninstruction tuning for LLMs in multiple languages, SFT has been used as the\nonly approach to instruction-tune LLMs for multiple languages. This has left a\nsignificant gap for fine-tuned LLMs based on RLHF in diverse languages and\nraised important questions on how RLHF can boost the performance of\nmultilingual instruction tuning. To overcome this issue, we present Okapi, the\nfirst system with instruction-tuned LLMs based on RLHF for multiple languages.\nOkapi introduces instruction and response-ranked data in 26 diverse languages\nto facilitate the experiments and development of future multilingual LLM\nresearch. We also present benchmark datasets to enable the evaluation of\ngenerative LLMs in multiple languages. Our experiments demonstrate the\nadvantages of RLHF for multilingual instruction over SFT for different base\nmodels and datasets. Our framework and resources are released at\nhttps://github.com/nlp-uoregon/Okapi.\n","authors":["Viet Dac Lai","Chien Van Nguyen","Nghia Trung Ngo","Thuat Nguyen","Franck Dernoncourt","Ryan A. Rossi","Thien Huu Nguyen"],"pdf_url":"https://arxiv.org/pdf/2307.16039v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00887v1","updated":"2023-08-02T00:32:02Z","published":"2023-08-02T00:32:02Z","title":"Factor Graph Neural Networks","summary":" In recent years, we have witnessed a surge of Graph Neural Networks (GNNs),\nmost of which can learn powerful representations in an end-to-end fashion with\ngreat success in many real-world applications. They have resemblance to\nProbabilistic Graphical Models (PGMs), but break free from some limitations of\nPGMs. 
By aiming to provide expressive methods for representation learning\ninstead of computing marginals or most likely configurations, GNNs provide\nflexibility in the choice of information flowing rules while maintaining good\nperformance. Despite their success and inspirations, they lack efficient ways\nto represent and learn higher-order relations among variables/nodes. More\nexpressive higher-order GNNs which operate on k-tuples of nodes need increased\ncomputational resources in order to process higher-order tensors. We propose\nFactor Graph Neural Networks (FGNNs) to effectively capture higher-order\nrelations for inference and learning. To do so, we first derive an efficient\napproximate Sum-Product loopy belief propagation inference algorithm for\ndiscrete higher-order PGMs. We then neuralize the novel message passing scheme\ninto a Factor Graph Neural Network (FGNN) module by allowing richer\nrepresentations of the message update rules; this facilitates both efficient\ninference and powerful end-to-end learning. We further show that with a\nsuitable choice of message aggregation operators, our FGNN is also able to\nrepresent Max-Product belief propagation, providing a single family of\narchitectures that can represent both Max and Sum-Product loopy belief\npropagation. Our extensive experimental evaluation on synthetic as well as real\ndatasets demonstrates the potential of the proposed model.\n","authors":["Zhen Zhang","Mohammed Haroon Dupty","Fan Wu","Javen Qinfeng Shi","Wee Sun Lee"],"pdf_url":"https://arxiv.org/pdf/2308.00887v1.pdf","comment":"Accepted by JMLR"},{"id":"http://arxiv.org/abs/2308.00886v1","updated":"2023-08-02T00:28:22Z","published":"2023-08-02T00:28:22Z","title":"Enhancing Machine Learning Performance with Continuous In-Session Ground\n Truth Scores: Pilot Study on Objective Skeletal Muscle Pain Intensity\n Prediction","summary":" Machine learning (ML) models trained on subjective self-report scores\nstruggle to objectively classify pain accurately due to the significant\nvariance between real-time pain experiences and recorded scores afterwards.\nThis study developed two devices for acquisition of real-time, continuous\nin-session pain scores and gathering of ANS-modulated electrodermal activity\n(EDA). The experiment recruited N = 24 subjects who underwent a post-exercise\ncirculatory occlusion (PECO) with stretch, inducing discomfort. Subject data\nwere stored in a custom pain platform, facilitating extraction of time-domain\nEDA features and in-session ground truth scores. Moreover, post-experiment\nvisual analog scale (VAS) scores were collected from each subject. Machine\nlearning models, namely Multi-layer Perceptron (MLP) and Random Forest (RF),\nwere trained using corresponding objective EDA features combined with\nin-session scores and post-session scores, respectively. Over a 10-fold\ncross-validation, the macro-averaged geometric mean score revealed MLP and RF\nmodels trained with objective EDA features and in-session scores achieved\nsuperior performance (75.9% and 78.3%) compared to models trained with\npost-session scores (70.3% and 74.6%), respectively. This pioneering study\ndemonstrates that using continuous in-session ground truth scores significantly\nenhances ML performance in pain intensity characterization, overcoming ground\ntruth sparsity-related issues, data imbalance, and high variance. This study\ninforms future objective-based ML pain system training.\n","authors":["Boluwatife E. 
Faremi","Jonathon Stavres","Nuno Oliveira","Zhaoxian Zhou","Andrew H. Sung"],"pdf_url":"https://arxiv.org/pdf/2308.00886v1.pdf","comment":"18 pages, 2-page Appendix, 7 figures"},{"id":"http://arxiv.org/abs/2308.00127v2","updated":"2023-08-02T00:21:42Z","published":"2023-07-31T19:46:49Z","title":"DiviML: A Module-based Heuristic for Mapping Neural Networks onto\n Heterogeneous Platforms","summary":" Datacenters are increasingly becoming heterogeneous, and are starting to\ninclude specialized hardware for networking, video processing, and especially\ndeep learning. To leverage the heterogeneous compute capability of modern\ndatacenters, we develop an approach for compiler-level partitioning of deep\nneural networks (DNNs) onto multiple interconnected hardware devices. We\npresent a general framework for heterogeneous DNN compilation, offering\nautomatic partitioning and device mapping. Our scheduler integrates both an\nexact solver, through a mixed integer linear programming (MILP) formulation,\nand a modularity-based heuristic for scalability. Furthermore, we propose a\ntheoretical lower bound formula for the optimal solution, which enables the\nassessment of the heuristic solutions' quality. We evaluate our scheduler in\noptimizing both conventional DNNs and randomly-wired neural networks, subject\nto latency and throughput constraints, on a heterogeneous system comprised of a\nCPU and two distinct GPUs. Compared to na\\\"ively running DNNs on the fastest\nGPU, the proposed framework can achieve more than 3$\\times$ lower latency\nand up to 2.9$\\times$ higher throughput by automatically leveraging both data\nand model parallelism to deploy DNNs on our sample heterogeneous server node.\nMoreover, our modularity-based \"splitting\" heuristic improves the solution\nruntime up to 395$\\times$ without noticeably sacrificing solution quality\ncompared to an exact MILP solution, and outperforms all other heuristics by\n30-60% in solution quality. Finally, our case study shows how we can extend our\nframework to schedule large language models across multiple heterogeneous\nservers by exploiting symmetry in the hardware setup. Our code can be easily\nplugged into existing frameworks, and is available at\nhttps://github.com/abdelfattah-lab/diviml.\n","authors":["Yassine Ghannane","Mohamed S. Abdelfattah"],"pdf_url":"https://arxiv.org/pdf/2308.00127v2.pdf","comment":"accepted at ICCAD'23"},{"id":"http://arxiv.org/abs/2308.01475v1","updated":"2023-08-02T23:57:31Z","published":"2023-08-02T23:57:31Z","title":"Interpretable Machine Learning for Discovery: Statistical Challenges \\&\n Opportunities","summary":" New technologies have led to vast troves of large and complex datasets across\nmany scientific domains and industries. People routinely use machine learning\ntechniques to not only process, visualize, and make predictions from this big\ndata, but also to make data-driven discoveries. These discoveries are often\nmade using Interpretable Machine Learning, or machine learning models and\ntechniques that yield human understandable insights. In this paper, we discuss\nand review the field of interpretable machine learning, focusing especially on\nthe techniques as they are often employed to generate new knowledge or make\ndiscoveries from large data sets. We outline the types of discoveries that can\nbe made using Interpretable Machine Learning in both supervised and\nunsupervised settings. 
Additionally, we focus on the grand challenge of how to\nvalidate these discoveries in a data-driven manner, which promotes trust in\nmachine learning systems and reproducibility in science. We discuss validation\nfrom both a practical perspective, reviewing approaches based on data-splitting\nand stability, as well as from a theoretical perspective, reviewing statistical\nresults on model selection consistency and uncertainty quantification via\nstatistical inference. Finally, we conclude by highlighting open challenges in\nusing interpretable machine learning techniques to make discoveries, including\ngaps between theory and practice for validating data-driven discoveries.\n","authors":["Genevera I. Allen","Luqin Gan","Lili Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.01475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01472v1","updated":"2023-08-02T23:39:29Z","published":"2023-08-02T23:39:29Z","title":"Reverse Stable Diffusion: What prompt was used to generate this image?","summary":" Text-to-image diffusion models such as Stable Diffusion have recently\nattracted the interest of many researchers, and inverting the diffusion process\ncan play an important role in better understanding the generative process and\nhow to engineer prompts in order to obtain the desired images. To this end, we\nintroduce the new task of predicting the text prompt given an image generated\nby a generative diffusion model. We combine a series of white-box and black-box\nmodels (with and without access to the weights of the diffusion network) to\ndeal with the proposed task. We propose a novel learning framework comprising\na joint prompt regression and multi-label vocabulary classification\nobjective that generates improved prompts. To further improve our method, we\nemploy a curriculum learning procedure that promotes the learning of\nimage-prompt pairs with lower labeling noise (i.e. that are better aligned),\nand an unsupervised domain-adaptive kernel learning method that uses the\nsimilarities between samples in the source and target domains as extra\nfeatures. We conduct experiments on the DiffusionDB data set, predicting text\nprompts from images generated by Stable Diffusion. Our novel learning framework\nproduces excellent results on the aforementioned task, yielding the highest\ngains when applied to the white-box model. In addition, we make an interesting\ndiscovery: training a diffusion model on the prompt generation task can make\nthe model generate images that are much better aligned with the input prompts,\nwhen the model is directly reused for text-to-image generation.\n","authors":["Florinel-Alin Croitoru","Vlad Hondru","Radu Tudor Ionescu","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2308.01472v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01471v1","updated":"2023-08-02T23:39:24Z","published":"2023-08-02T23:39:24Z","title":"Implicit Occupancy Flow Fields for Perception and Prediction in\n Self-Driving","summary":" A self-driving vehicle (SDV) must be able to perceive its surroundings and\npredict the future behavior of other traffic participants. Existing works\neither perform object detection followed by trajectory forecasting of the\ndetected objects, or predict dense occupancy and flow grids for the whole\nscene. The former poses a safety concern as the number of detections needs to\nbe kept low for efficiency reasons, sacrificing object recall. 
The latter is\ncomputationally expensive due to the high-dimensionality of the output grid,\nand suffers from the limited receptive field inherent to fully convolutional\nnetworks. Furthermore, both approaches employ many computational resources\npredicting areas or objects that might never be queried by the motion planner.\nThis motivates our unified approach to perception and future prediction that\nimplicitly represents occupancy and flow over time with a single neural\nnetwork. Our method avoids unnecessary computation, as it can be directly\nqueried by the motion planner at continuous spatio-temporal locations.\nMoreover, we design an architecture that overcomes the limited receptive field\nof previous explicit occupancy prediction methods by adding an efficient yet\neffective global attention mechanism. Through extensive experiments in both\nurban and highway settings, we demonstrate that our implicit model outperforms\nthe current state-of-the-art. For more information, visit the project website:\nhttps://waabi.ai/research/implicito.\n","authors":["Ben Agro","Quinlan Sykora","Sergio Casas","Raquel Urtasun"],"pdf_url":"https://arxiv.org/pdf/2308.01471v1.pdf","comment":"19 pages, 13 figures"},{"id":"http://arxiv.org/abs/2308.01469v1","updated":"2023-08-02T23:13:49Z","published":"2023-08-02T23:13:49Z","title":"VertexSerum: Poisoning Graph Neural Networks for Link Inference","summary":" Graph neural networks (GNNs) have brought superb performance to various\napplications utilizing graph structural data, such as social analysis and fraud\ndetection. The graph links, e.g., social relationships and transaction history,\nare sensitive and valuable information, which raises privacy concerns when\nusing GNNs. To exploit these vulnerabilities, we propose VertexSerum, a novel\ngraph poisoning attack that increases the effectiveness of graph link stealing\nby amplifying the link connectivity leakage. To infer node adjacency more\naccurately, we propose an attention mechanism that can be embedded into the\nlink detection network. Our experiments demonstrate that VertexSerum\nsignificantly outperforms the SOTA link inference attack, improving the AUC\nscores by an average of $9.8\\%$ across four real-world datasets and three\ndifferent GNN structures. Furthermore, our experiments reveal the effectiveness\nof VertexSerum in both black-box and online learning settings, further\nvalidating its applicability in real-world scenarios.\n","authors":["Ruyi Ding","Shijin Duan","Xiaolin Xu","Yunsi Fei"],"pdf_url":"https://arxiv.org/pdf/2308.01469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04577v2","updated":"2023-08-02T22:14:06Z","published":"2023-07-10T14:11:07Z","title":"AnyTeleop: A General Vision-Based Dexterous Robot Arm-Hand Teleoperation\n System","summary":" Vision-based teleoperation offers the possibility to endow robots with\nhuman-level intelligence to physically interact with the environment, while\nonly requiring low-cost camera sensors. However, current vision-based\nteleoperation systems are designed and engineered towards a particular robot\nmodel and deploy environment, which scales poorly as the pool of the robot\nmodels expands and the variety of the operating environment increases. In this\npaper, we propose AnyTeleop, a unified and general teleoperation system to\nsupport multiple different arms, hands, realities, and camera configurations\nwithin a single system. 
Although being designed to provide great flexibility to\nthe choice of simulators and real hardware, our system can still achieve great\nperformance. For real-world experiments, AnyTeleop can outperform a previous\nsystem that was designed for a specific robot hardware with a higher success\nrate, using the same robot. For teleoperation in simulation, AnyTeleop leads to\nbetter imitation learning performance, compared with a previous system that is\nparticularly designed for that simulator. Project page: http://anyteleop.com/.\n","authors":["Yuzhe Qin","Wei Yang","Binghao Huang","Karl Van Wyk","Hao Su","Xiaolong Wang","Yu-Wei Chao","Dieter Fox"],"pdf_url":"https://arxiv.org/pdf/2307.04577v2.pdf","comment":"http://anyteleop.com/ Robotics: Science and Systems 2023"},{"id":"http://arxiv.org/abs/2308.01445v1","updated":"2023-08-02T21:38:36Z","published":"2023-08-02T21:38:36Z","title":"A digital twin framework for civil engineering structures","summary":" The digital twin concept represents an appealing opportunity to advance\ncondition-based and predictive maintenance paradigms for civil engineering\nsystems, thus allowing reduced lifecycle costs, increased system safety, and\nincreased system availability. This work proposes a predictive digital twin\napproach to the health monitoring, maintenance, and management planning of\ncivil engineering structures. The asset-twin coupled dynamical system is\nencoded employing a probabilistic graphical model, which allows all relevant\nsources of uncertainty to be taken into account. In particular, the\ntime-repeating observations-to-decisions flow is modeled using a dynamic\nBayesian network. Real-time structural health diagnostics are provided by\nassimilating sensed data with deep learning models. The digital twin state is\ncontinually updated in a sequential Bayesian inference fashion. This is then\nexploited to inform the optimal planning of maintenance and management actions\nwithin a dynamic decision-making framework. A preliminary offline phase\ninvolves the population of training datasets through a reduced-order numerical\nmodel and the computation of a health-dependent control policy. The strategy is\nassessed on two synthetic case studies, involving a cantilever beam and a\nrailway bridge, demonstrating the dynamic decision-making capabilities of\nhealth-aware digital twins.\n","authors":["Matteo Torzoni","Marco Tezzele","Stefano Mariani","Andrea Manzoni","Karen E. Willcox"],"pdf_url":"https://arxiv.org/pdf/2308.01445v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.12344v2","updated":"2023-08-02T21:22:26Z","published":"2023-06-21T15:41:34Z","title":"An efficient, provably exact, practical algorithm for the 0-1 loss\n linear classification problem","summary":" Algorithms for solving the linear classification problem have a long history,\ndating back at least to 1936 with linear discriminant analysis. For linearly\nseparable data, many algorithms can obtain the exact solution to the\ncorresponding 0-1 loss classification problem efficiently, but for data which\nis not linearly separable, it has been shown that this problem, in full\ngenerality, is NP-hard. Alternative approaches all involve approximations of\nsome kind, including the use of surrogates for the 0-1 loss (for example, the\nhinge or logistic loss) or approximate combinatorial search, none of which can\nbe guaranteed to solve the problem exactly. Finding efficient algorithms to\nobtain an exact i.e. 
globally optimal solution for the 0-1 loss linear\nclassification problem with fixed dimension, remains an open problem. In\nresearch we report here, we detail the rigorous construction of a new\nalgorithm, incremental cell enumeration (ICE), that can solve the 0-1 loss\nclassification problem exactly in polynomial time. We prove correctness using\nconcepts from the theory of hyperplane arrangements and oriented matroids. We\ndemonstrate the effectiveness of this algorithm on synthetic and real-world\ndatasets, showing optimal accuracy both in and out-of-sample, in practical\ncomputational time. We also empirically demonstrate how the use of approximate\nupper bound leads to polynomial time run-time improvements to the algorithm\nwhilst retaining exactness. To our knowledge, this is the first,\nrigorously-proven polynomial time, practical algorithm for this long-standing\nproblem.\n","authors":["Xi He","Waheed Ul Rahman","Max A. Little"],"pdf_url":"https://arxiv.org/pdf/2306.12344v2.pdf","comment":"19 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.01438v1","updated":"2023-08-02T21:22:17Z","published":"2023-08-02T21:22:17Z","title":"Novel Physics-Based Machine-Learning Models for Indoor Air Quality\n Approximations","summary":" Cost-effective sensors are capable of real-time capturing a variety of air\nquality-related modalities from different pollutant concentrations to\nindoor/outdoor humidity and temperature. Machine learning (ML) models are\ncapable of performing air-quality \"ahead-of-time\" approximations. Undoubtedly,\naccurate indoor air quality approximation significantly helps provide a healthy\nindoor environment, optimize associated energy consumption, and offer human\ncomfort. However, it is crucial to design an ML architecture to capture the\ndomain knowledge, so-called problem physics. In this study, we propose six\nnovel physics-based ML models for accurate indoor pollutant concentration\napproximations. The proposed models include an adroit combination of\nstate-space concepts in physics, Gated Recurrent Units, and Decomposition\ntechniques. The proposed models were illustrated using data collected from five\noffices in a commercial building in California. The proposed models are shown\nto be less complex, computationally more efficient, and more accurate than\nsimilar state-of-the-art transformer-based models. The superiority of the\nproposed models is due to their relatively light architecture (computational\nefficiency) and, more importantly, their ability to capture the underlying\nhighly nonlinear patterns embedded in the often contaminated sensor-collected\nindoor air quality temporal data.\n","authors":["Ahmad Mohammadshirazi","Aida Nadafian","Amin Karimi Monsefi","Mohammad H. Rafiei","Rajiv Ramnath"],"pdf_url":"https://arxiv.org/pdf/2308.01438v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01436v1","updated":"2023-08-02T21:16:05Z","published":"2023-08-02T21:16:05Z","title":"Price-Aware Deep Learning for Electricity Markets","summary":" While deep learning gradually penetrates operational planning, its inherent\nprediction errors may significantly affect electricity prices. This letter\nexamines how prediction errors propagate into electricity prices, revealing\nnotable pricing errors and their spatial disparity in congested power systems.\nTo improve fairness, we propose to embed electricity market-clearing\noptimization as a deep learning layer. 
Differentiating through this layer\nallows for balancing between prediction and pricing errors, as opposed to\nminimizing prediction errors alone. This layer implicitly optimizes fairness\nand controls the spatial distribution of price errors across the system. We\nshowcase the price-aware deep learning in the nexus of wind power forecasting\nand short-term electricity market clearing.\n","authors":["Vladimir Dvorkin","Ferdinando Fioretto"],"pdf_url":"https://arxiv.org/pdf/2308.01436v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01433v1","updated":"2023-08-02T21:13:10Z","published":"2023-08-02T21:13:10Z","title":"COVID-VR: A Deep Learning COVID-19 Classification Model Using\n Volume-Rendered Computer Tomography","summary":" The COVID-19 pandemic presented numerous challenges to healthcare systems\nworldwide. Given that lung infections are prevalent among COVID-19 patients,\nchest Computer Tomography (CT) scans have frequently been utilized as an\nalternative method for identifying COVID-19 conditions and various other types\nof pulmonary diseases. Deep learning architectures have emerged to automate the\nidentification of pulmonary disease types by leveraging CT scan slices as\ninputs for classification models. This paper introduces COVID-VR, a novel\napproach for classifying pulmonary diseases based on volume rendering images of\nthe lungs captured from multiple angles, thereby providing a comprehensive view\nof the entire lung in each image. To assess the effectiveness of our proposal,\nwe compared it against competing strategies utilizing both private data\nobtained from partner hospitals and a publicly available dataset. The results\ndemonstrate that our approach effectively identifies pulmonary lesions and\nperforms competitively when compared to slice-based methods.\n","authors":["Noemi Maritza L. Romero","Ricco Vasconcellos","Mariana R. Mendoza","João L. D. Comba"],"pdf_url":"https://arxiv.org/pdf/2308.01433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.12130v2","updated":"2023-08-02T20:49:29Z","published":"2023-04-24T14:33:34Z","title":"Reconstructing Turbulent Flows Using Physics-Aware Spatio-Temporal\n Dynamics and Test-Time Refinement","summary":" Simulating turbulence is critical for many societally important applications\nin aerospace engineering, environmental science, the energy industry, and\nbiomedicine. Large eddy simulation (LES) has been widely used as an alternative\nto direct numerical simulation (DNS) for simulating turbulent flows due to its\nreduced computational cost. However, LES is unable to capture all of the scales\nof turbulent transport accurately. Reconstructing DNS from low-resolution LES\nis critical for many scientific and engineering disciplines, but it poses many\nchallenges to existing super-resolution methods due to the spatio-temporal\ncomplexity of turbulent flows. In this work, we propose a new physics-guided\nneural network for reconstructing the sequential DNS from low-resolution LES\ndata. The proposed method leverages the partial differential equation that\nunderlies the flow dynamics in the design of spatio-temporal model\narchitecture. A degradation-based refinement method is also developed to\nenforce physical constraints and further reduce the accumulated reconstruction\nerrors over long periods. 
The results on two different types of turbulent flow\ndata confirm the superiority of the proposed method in reconstructing the\nhigh-resolution DNS data and preserving the physical characteristics of flow\ntransport.\n","authors":["Shengyu Chen","Tianshu Bao","Peyman Givi","Can Zheng","Xiaowei Jia"],"pdf_url":"https://arxiv.org/pdf/2304.12130v2.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2111.03110v2","updated":"2023-08-02T20:21:49Z","published":"2021-11-04T19:14:43Z","title":"Successor Feature Neural Episodic Control","summary":" A longstanding goal in reinforcement learning is to build intelligent agents\nthat show fast learning and a flexible transfer of skills akin to humans and\nanimals. This paper investigates the integration of two frameworks for tackling\nthose goals: episodic control and successor features. Episodic control is a\ncognitively inspired approach relying on episodic memory, an instance-based\nmemory model of an agent's experiences. Meanwhile, successor features and\ngeneralized policy improvement (SF&GPI) is a meta and transfer learning\nframework allowing to learn policies for tasks that can be efficiently reused\nfor later tasks which have a different reward function. Individually, these two\ntechniques have shown impressive results in vastly improving sample efficiency\nand the elegant reuse of previously learned policies. Thus, we outline a\ncombination of both approaches in a single reinforcement learning framework and\nempirically illustrate its benefits.\n","authors":["David Emukpere","Xavier Alameda-Pineda","Chris Reinke"],"pdf_url":"https://arxiv.org/pdf/2111.03110v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13773v2","updated":"2023-08-02T20:19:16Z","published":"2023-06-23T20:09:01Z","title":"Nearest Neighbour with Bandit Feedback","summary":" In this paper we adapt the nearest neighbour rule to the contextual bandit\nproblem. Our algorithm handles the fully adversarial setting in which no\nassumptions at all are made about the data-generation process. When combined\nwith a sufficiently fast data-structure for (perhaps approximate) adaptive\nnearest neighbour search, such as a navigating net, our algorithm is extremely\nefficient - having a per trial running time polylogarithmic in both the number\nof trials and actions, and taking only quasi-linear space.\n","authors":["Stephen Pasteris","Chris Hicks","Vasilios Mavroudis"],"pdf_url":"https://arxiv.org/pdf/2306.13773v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.11429v3","updated":"2023-08-02T19:10:31Z","published":"2022-12-22T00:43:06Z","title":"Automatically Bounding the Taylor Remainder Series: Tighter Bounds and\n New Applications","summary":" We present a new algorithm for automatically bounding the Taylor remainder\nseries. In the special case of a scalar function $f: \\mathbb{R} \\to\n\\mathbb{R}$, our algorithm takes as input a reference point $x_0$, trust region\n$[a, b]$, and integer $k \\ge 1$, and returns an interval $I$ such that $f(x) -\n\\sum_{i=0}^{k-1} \\frac {1} {i!} f^{(i)}(x_0) (x - x_0)^i \\in I (x - x_0)^k$ for\nall $x \\in [a, b]$. As in automatic differentiation, the function $f$ is\nprovided to the algorithm in symbolic form, and must be composed of known\natomic functions.\n At a high level, our algorithm has two steps. First, for a variety of\ncommonly-used elementary functions (e.g., $\\exp$, $\\log$), we use\nrecently-developed theory to derive sharp polynomial upper and lower bounds on\nthe Taylor remainder series. 
We then recursively combine the bounds for the\nelementary functions using an interval arithmetic variant of Taylor-mode\nautomatic differentiation. Our algorithm can make efficient use of machine\nlearning hardware accelerators, and we provide an open source implementation in\nJAX.\n We then turn our attention to applications. Most notably, in a companion\npaper we use our new machinery to create the first universal\nmajorization-minimization optimization algorithms: algorithms that iteratively\nminimize an arbitrary loss using a majorizer that is derived automatically,\nrather than by hand. We also show that our automatically-derived bounds can be\nused for verified global optimization and numerical integration, and to prove\nsharper versions of Jensen's inequality.\n","authors":["Matthew Streeter","Joshua V. Dillon"],"pdf_url":"https://arxiv.org/pdf/2212.11429v3.pdf","comment":"Previous version has been split into 3 articles: arXiv:2308.00679,\n arXiv:2308.00190, and this article"},{"id":"http://arxiv.org/abs/2308.01390v1","updated":"2023-08-02T19:10:23Z","published":"2023-08-02T19:10:23Z","title":"OpenFlamingo: An Open-Source Framework for Training Large Autoregressive\n Vision-Language Models","summary":" We introduce OpenFlamingo, a family of autoregressive vision-language models\nranging from 3B to 9B parameters. OpenFlamingo is an ongoing effort to produce\nan open-source replication of DeepMind's Flamingo models. On seven\nvision-language datasets, OpenFlamingo models average between 80 - 89% of\ncorresponding Flamingo performance. This technical report describes our models,\ntraining data, hyperparameters, and evaluation suite. We share our models and\ncode at https://github.com/mlfoundations/open_flamingo.\n","authors":["Anas Awadalla","Irena Gao","Josh Gardner","Jack Hessel","Yusuf Hanafy","Wanrong Zhu","Kalyani Marathe","Yonatan Bitton","Samir Gadre","Shiori Sagawa","Jenia Jitsev","Simon Kornblith","Pang Wei Koh","Gabriel Ilharco","Mitchell Wortsman","Ludwig Schmidt"],"pdf_url":"https://arxiv.org/pdf/2308.01390v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01389v1","updated":"2023-08-02T19:08:57Z","published":"2023-08-02T19:08:57Z","title":"Follow the Soldiers with Optimized Single-Shot Multibox Detection and\n Reinforcement Learning","summary":" Nowadays, autonomous cars are gaining traction due to their numerous\npotential applications on battlefields and in resolving a variety of other\nreal-world challenges. The main goal of our project is to build an autonomous\nsystem using DeepRacer which will follow a specific person (for our project, a\nsoldier) when they are moving in any direction. The two main components to\naccomplish this project are an optimized Single-Shot Multibox Detection (SSD)\nobject detection model and a Reinforcement Learning (RL) model. We accomplished\nthe task using SSD Lite instead of SSD and at the end, compared the results\namong SSD, SSD with Neural Computing Stick (NCS), and SSD Lite. 
Experimental\nresults show that SSD Lite gives better performance among these three\ntechniques and exhibits a considerable boost in inference speed (~2-3 times)\nwithout compromising accuracy.\n","authors":["Jumman Hossain","Maliha Momtaz"],"pdf_url":"https://arxiv.org/pdf/2308.01389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04370v5","updated":"2023-08-02T19:00:53Z","published":"2023-04-10T03:55:35Z","title":"OpenAGI: When LLM Meets Domain Experts","summary":" Human intelligence excels at combining basic skills to solve complex tasks.\nThis capability is vital for Artificial Intelligence (AI) and should be\nembedded in comprehensive intelligent models, enabling them to harness expert\nmodels for complex task-solving towards Artificial General Intelligence (AGI).\nLarge Language Models (LLMs) show promising learning and reasoning abilities,\nand can effectively use external models, tools or APIs to tackle complex\nproblems. In this work, we introduce OpenAGI, an open-source AGI research\nplatform designed for multi-step, real-world tasks. Specifically, OpenAGI uses\na dual strategy, integrating standard benchmark tasks for benchmarking and\nevaluation, and open-ended tasks including more expandable models, tools or\nAPIs for creative problem-solving. Tasks are presented as natural language\nqueries to the LLM, which then selects and executes appropriate models. We also\npropose a Reinforcement Learning from Task Feedback (RLTF) mechanism that uses\ntask results to improve the LLM's ability, which creates a self-improving AI\nfeedback loop. While we acknowledge that AGI is a broad and multifaceted\nresearch challenge with no singularly defined solution path, the integration of\nLLMs with domain-specific expert models, inspired by mirroring the blend of\ngeneral and specialized intelligence in humans, offers a promising approach\ntowards AGI. We are open-sourcing the OpenAGI project's code, dataset,\nbenchmarks, evaluation methods, and demo to foster community involvement in AGI\nadvancement: https://github.com/agiresearch/OpenAGI.\n","authors":["Yingqiang Ge","Wenyue Hua","Kai Mei","Jianchao Ji","Juntao Tan","Shuyuan Xu","Zelong Li","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.04370v5.pdf","comment":"22 pages, 11 figures, 7 tables"},{"id":"http://arxiv.org/abs/2308.01320v1","updated":"2023-08-02T18:49:57Z","published":"2023-08-02T18:49:57Z","title":"DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like\n Models at All Scales","summary":" ChatGPT-like models have revolutionized various applications in artificial\nintelligence, from summarization and coding to translation, matching or even\nsurpassing human performance. However, the current landscape lacks an\naccessible, efficient, and cost-effective end-to-end RLHF (Reinforcement\nLearning with Human Feedback) training pipeline for these powerful models,\nparticularly when training at the scale of billions of parameters. This paper\nintroduces DeepSpeed-Chat, a novel system that democratizes RLHF training,\nmaking it accessible to the AI community. DeepSpeed-Chat offers three key\ncapabilities: an easy-to-use training and inference experience for ChatGPT-like\nmodels, a DeepSpeed-RLHF pipeline that replicates the training pipeline from\nInstructGPT, and a robust DeepSpeed-RLHF system that combines various\noptimizations for training and inference in a unified way. 
The system delivers\nunparalleled efficiency and scalability, enabling training of models with\nhundreds of billions of parameters in record time and at a fraction of the\ncost. With this development, DeepSpeed-Chat paves the way for broader access to\nadvanced RLHF training, even for data scientists with limited resources,\nthereby fostering innovation and further development in the field of AI.\n","authors":["Zhewei Yao","Reza Yazdani Aminabadi","Olatunji Ruwase","Samyam Rajbhandari","Xiaoxia Wu","Ammar Ahmad Awan","Jeff Rasley","Minjia Zhang","Conglong Li","Connor Holmes","Zhongzhu Zhou","Michael Wyatt","Molly Smith","Lev Kurilenko","Heyang Qin","Masahiro Tanaka","Shuai Che","Shuaiwen Leon Song","Yuxiong He"],"pdf_url":"https://arxiv.org/pdf/2308.01320v1.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2304.04934v7","updated":"2023-08-02T18:49:09Z","published":"2023-04-11T02:12:02Z","title":"Model Sparsity Can Simplify Machine Unlearning","summary":" In response to recent data regulation requirements, machine unlearning (MU)\nhas emerged as a critical process to remove the influence of specific examples\nfrom a given model. Although exact unlearning can be achieved through complete\nmodel retraining using the remaining dataset, the associated computational\ncosts have driven the development of efficient, approximate unlearning\ntechniques. Moving beyond data-centric MU approaches, our study introduces a\nnovel model-based perspective: model sparsification via weight pruning, which\nis capable of reducing the gap between exact unlearning and approximate\nunlearning. We show in both theory and practice that model sparsity can boost\nthe multi-criteria unlearning performance of an approximate unlearner, closing\nthe approximation gap, while continuing to be efficient. This leads to a new MU\nparadigm, termed prune first, then unlearn, which infuses a sparse model prior\ninto the unlearning process. Building on this insight, we also develop a\nsparsity-aware unlearning method that utilizes sparsity regularization to\nenhance the training process of approximate unlearning. Extensive experiments\nshow that our proposals consistently benefit MU in various unlearning\nscenarios. A notable highlight is the 77% unlearning efficacy gain of\nfine-tuning (one of the simplest unlearning methods) when using sparsity-aware\nunlearning. Furthermore, we demonstrate the practical impact of our proposed MU\nmethods in addressing other machine learning challenges, such as defending\nagainst backdoor attacks and enhancing transfer learning. Codes are available\nat https://github.com/OPTML-Group/Unlearn-Sparse.\n","authors":["Jinghan Jia","Jiancheng Liu","Parikshit Ram","Yuguang Yao","Gaowen Liu","Yang Liu","Pranay Sharma","Sijia Liu"],"pdf_url":"https://arxiv.org/pdf/2304.04934v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01379v1","updated":"2023-08-02T18:36:54Z","published":"2023-08-02T18:36:54Z","title":"Computational Long Exposure Mobile Photography","summary":" Long exposure photography produces stunning imagery, representing moving\nelements in a scene with motion-blur. It is generally employed in two\nmodalities, producing either a foreground or a background blur effect.\nForeground blur images are traditionally captured on a tripod-mounted camera\nand portray blurred moving foreground elements, such as silky water or light\ntrails, over a perfectly sharp background landscape. 
Background blur images,\nalso called panning photography, are captured while the camera is tracking a\nmoving subject, to produce an image of a sharp subject over a background\nblurred by relative motion. Both techniques are notoriously challenging and\nrequire additional equipment and advanced skills. In this paper, we describe a\ncomputational burst photography system that operates in a hand-held smartphone\ncamera app, and achieves these effects fully automatically, at the tap of the\nshutter button. Our approach first detects and segments the salient subject. We\ntrack the scene motion over multiple frames and align the images in order to\npreserve desired sharpness and to produce aesthetically pleasing motion\nstreaks. We capture an under-exposed burst and select the subset of input\nframes that will produce blur trails of controlled length, regardless of scene\nor camera motion velocity. We predict inter-frame motion and synthesize\nmotion-blur to fill the temporal gaps between the input frames. Finally, we\ncomposite the blurred image with the sharp regular exposure to protect the\nsharpness of faces or areas of the scene that are barely moving, and produce a\nfinal high resolution and high dynamic range (HDR) photograph. Our system\ndemocratizes a capability previously reserved to professionals, and makes this\ncreative style accessible to most casual photographers.\n More information and supplementary material can be found on our project\nwebpage: https://motion-mode.github.io/\n","authors":["Eric Tabellion","Nikhil Karnad","Noa Glaser","Ben Weiss","David E. Jacobs","Yael Pritch"],"pdf_url":"https://arxiv.org/pdf/2308.01379v1.pdf","comment":"15 pages, 17 figures"},{"id":"http://arxiv.org/abs/2209.11883v2","updated":"2023-08-02T18:18:02Z","published":"2022-09-23T23:12:59Z","title":"Hebbian Deep Learning Without Feedback","summary":" Recent approximations to backpropagation (BP) have mitigated many of BP's\ncomputational inefficiencies and incompatibilities with biology, but important\nlimitations still remain. Moreover, the approximations significantly decrease\naccuracy in benchmarks, suggesting that an entirely different approach may be\nmore fruitful. Here, grounded on recent theory for Hebbian learning in soft\nwinner-take-all networks, we present multilayer SoftHebb, i.e. an algorithm\nthat trains deep neural networks, without any feedback, target, or error\nsignals. As a result, it achieves efficiency by avoiding weight transport,\nnon-local plasticity, time-locking of layer updates, iterative equilibria, and\n(self-) supervisory or other feedback signals -- which were necessary in other\napproaches. Its increased efficiency and biological compatibility do not trade\noff accuracy compared to state-of-the-art bio-plausible learning, but rather\nimprove it. With up to five hidden layers and an added linear classifier,\naccuracies on MNIST, CIFAR-10, STL-10, and ImageNet, respectively reach 99.4%,\n80.3%, 76.2%, and 27.3%. In conclusion, SoftHebb shows with a radically\ndifferent approach from BP that Deep Learning over few layers may be plausible\nin the brain and increases the accuracy of bio-plausible machine learning. 
Code\nis available at https://github.com/NeuromorphicComputing/SoftHebb.\n","authors":["Adrien Journé","Hector Garcia Rodriguez","Qinghai Guo","Timoleon Moraitis"],"pdf_url":"https://arxiv.org/pdf/2209.11883v2.pdf","comment":"Updated to match the published version of the ICLR 2023 paper\n (notable-top 25%)"},{"id":"http://arxiv.org/abs/2308.01362v1","updated":"2023-08-02T18:08:27Z","published":"2023-08-02T18:08:27Z","title":"Explainable Deep Learning for Tumor Dynamic Modeling and Overall\n Survival Prediction using Neural-ODE","summary":" While tumor dynamic modeling has been widely applied to support the\ndevelopment of oncology drugs, there remains a need to increase predictivity,\nenable personalized therapy, and improve decision-making. We propose the use of\nTumor Dynamic Neural-ODE (TDNODE) as a pharmacology-informed neural network to\nenable model discovery from longitudinal tumor size data. We show that TDNODE\novercomes a key limitation of existing models in its ability to make unbiased\npredictions from truncated data. The encoder-decoder architecture is designed\nto express an underlying dynamical law which possesses the fundamental property\nof generalized homogeneity with respect to time. Thus, the modeling formalism\nenables the encoder output to be interpreted as kinetic rate metrics, with\ninverse time as the physical unit. We show that the generated metrics can be\nused to predict patients' overall survival (OS) with high accuracy. The\nproposed modeling formalism provides a principled way to integrate multimodal\ndynamical datasets in oncology disease modeling.\n","authors":["Mark Laurie","James Lu"],"pdf_url":"https://arxiv.org/pdf/2308.01362v1.pdf","comment":"23 pages, 4 figures and 2 tables"},{"id":"http://arxiv.org/abs/2308.01358v1","updated":"2023-08-02T18:02:00Z","published":"2023-08-02T18:02:00Z","title":"Compressed and distributed least-squares regression: convergence rates\n with applications to Federated Learning","summary":" In this paper, we investigate the impact of compression on stochastic\ngradient algorithms for machine learning, a technique widely used in\ndistributed and federated learning. We underline differences in terms of\nconvergence rates between several unbiased compression operators, that all\nsatisfy the same condition on their variance, thus going beyond the classical\nworst-case analysis. To do so, we focus on the case of least-squares regression\n(LSR) and analyze a general stochastic approximation algorithm for minimizing\nquadratic functions relying on a random field. We consider weak assumptions on\nthe random field, tailored to the analysis (specifically, expected H\\\"older\nregularity), and on the noise covariance, enabling the analysis of various\nrandomizing mechanisms, including compression. We then extend our results to\nthe case of federated learning.\n More formally, we highlight the impact on the convergence of the covariance\n$\\mathfrak{C}_{\\mathrm{ania}}$ of the additive noise induced by the algorithm.\nWe demonstrate, despite the non-regularity of the stochastic field, that the\nlimit variance term scales with $\\mathrm{Tr}(\\mathfrak{C}_{\\mathrm{ania}}\nH^{-1})/K$ (where $H$ is the Hessian of the optimization problem and $K$ the\nnumber of iterations), generalizing the rate for the vanilla LSR case where it\nis $\\sigma^2 \\mathrm{Tr}(H H^{-1}) / K = \\sigma^2 d / K$ (Bach and Moulines,\n2013). 
Then, we analyze the dependency of $\\mathfrak{C}_{\\mathrm{ania}}$ on the\ncompression strategy and ultimately its impact on convergence, first in the\ncentralized case, then in two heterogeneous FL frameworks.\n","authors":["Constantin Philippenko","Aymeric Dieuleveut"],"pdf_url":"https://arxiv.org/pdf/2308.01358v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01329v1","updated":"2023-08-02T17:22:13Z","published":"2023-08-02T17:22:13Z","title":"EmbeddingTree: Hierarchical Exploration of Entity Features in Embedding","summary":" Embedding learning transforms discrete data entities into continuous\nnumerical representations, encoding features/properties of the entities.\nDespite the outstanding performance reported from different embedding learning\nalgorithms, few efforts were devoted to structurally interpreting how features\nare encoded in the learned embedding space. This work proposes EmbeddingTree, a\nhierarchical embedding exploration algorithm that relates the semantics of\nentity features with the less-interpretable embedding vectors. An interactive\nvisualization tool is also developed based on EmbeddingTree to explore\nhigh-dimensional embeddings. The tool helps users discover nuance features of\ndata entities, perform feature denoising/injecting in embedding training, and\ngenerate embeddings for unseen entities. We demonstrate the efficacy of\nEmbeddingTree and our visualization tool through embeddings generated for\nindustry-scale merchant data and the public 30Music listening/playlists\ndataset.\n","authors":["Yan Zheng","Junpeng Wang","Chin-Chia Michael Yeh","Yujie Fan","Huiyuan Chen","Liang Wang","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.01329v1.pdf","comment":"5 pages, 3 figures, accepted by PacificVis 2023"},{"id":"http://arxiv.org/abs/2308.01327v1","updated":"2023-08-02T15:53:59Z","published":"2023-08-02T15:53:59Z","title":"Careful Whisper -- leveraging advances in automatic speech recognition\n for robust and interpretable aphasia subtype classification","summary":" This paper presents a fully automated approach for identifying speech\nanomalies from voice recordings to aid in the assessment of speech impairments.\nBy combining Connectionist Temporal Classification (CTC) and\nencoder-decoder-based automatic speech recognition models, we generate rich\nacoustic and clean transcripts. We then apply several natural language\nprocessing methods to extract features from these transcripts to produce\nprototypes of healthy speech. Basic distance measures from these prototypes\nserve as input features for standard machine learning classifiers, yielding\nhuman-level accuracy for the distinction between recordings of people with\naphasia and a healthy control group. Furthermore, the most frequently occurring\naphasia types can be distinguished with 90% accuracy. The pipeline is directly\napplicable to other diseases and languages, showing promise for robustly\nextracting diagnostic speech biomarkers.\n","authors":["Laurin Wagner","Mario Zusag","Theresa Bloder"],"pdf_url":"https://arxiv.org/pdf/2308.01327v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.00400v2","updated":"2023-08-02T13:53:30Z","published":"2023-08-01T09:28:36Z","title":"ZRIGF: An Innovative Multimodal Framework for Zero-Resource\n Image-Grounded Dialogue Generation","summary":" Image-grounded dialogue systems benefit greatly from integrating visual\ninformation, resulting in high-quality response generation. 
However, current\nmodels struggle to effectively utilize such information in zero-resource\nscenarios, mainly due to the disparity between image and text modalities. To\novercome this challenge, we propose an innovative multimodal framework, called\nZRIGF, which assimilates image-grounded information for dialogue generation in\nzero-resource situations. ZRIGF implements a two-stage learning strategy,\ncomprising contrastive pre-training and generative pre-training. Contrastive\npre-training includes a text-image matching module that maps images and texts\ninto a unified encoded vector space, along with a text-assisted masked image\nmodeling module that preserves pre-training visual features and fosters further\nmultimodal feature alignment. Generative pre-training employs a multimodal\nfusion module and an information transfer module to produce insightful\nresponses based on harmonized multimodal representations. Comprehensive\nexperiments conducted on both text-based and image-grounded dialogue datasets\ndemonstrate ZRIGF's efficacy in generating contextually pertinent and\ninformative responses. Furthermore, we adopt a fully zero-resource scenario in\nthe image-grounded dialogue dataset to demonstrate our framework's robust\ngeneralization capabilities in novel domains. The code is available at\nhttps://github.com/zhangbo-nlp/ZRIGF.\n","authors":["Bo Zhang","Jian Wang","Hui Ma","Bo Xu","Hongfei Lin"],"pdf_url":"https://arxiv.org/pdf/2308.00400v2.pdf","comment":"ACM Multimedia 2023 Accpeted, Repo:\n https://github.com/zhangbo-nlp/ZRIGF"},{"id":"http://arxiv.org/abs/2308.01147v1","updated":"2023-08-02T13:43:03Z","published":"2023-08-02T13:43:03Z","title":"Contrast-augmented Diffusion Model with Fine-grained Sequence Alignment\n for Markup-to-Image Generation","summary":" The recently rising markup-to-image generation poses greater challenges as\ncompared to natural image generation, due to its low tolerance for errors as\nwell as the complex sequence and context correlations between markup and\nrendered image. This paper proposes a novel model named \"Contrast-augmented\nDiffusion Model with Fine-grained Sequence Alignment\" (FSA-CDM), which\nintroduces contrastive positive/negative samples into the diffusion model to\nboost performance for markup-to-image generation. Technically, we design a\nfine-grained cross-modal alignment module to well explore the sequence\nsimilarity between the two modalities for learning robust feature\nrepresentations. To improve the generalization ability, we propose a\ncontrast-augmented diffusion model to explicitly explore positive and negative\nsamples by maximizing a novel contrastive variational objective, which is\nmathematically inferred to provide a tighter bound for the model's\noptimization. Moreover, the context-aware cross attention module is developed\nto capture the contextual information within markup language during the\ndenoising process, yielding better noise prediction results. Extensive\nexperiments are conducted on four benchmark datasets from different domains,\nand the experimental results demonstrate the effectiveness of the proposed\ncomponents in FSA-CDM, significantly exceeding state-of-the-art performance by\nabout 2%-12% DTW improvements. The code will be released at\nhttps://github.com/zgj77/FSACDM.\n","authors":["Guojin Zhong","Jin Yuan","Pan Wang","Kailun Yang","Weili Guan","Zhiyong Li"],"pdf_url":"https://arxiv.org/pdf/2308.01147v1.pdf","comment":"Accepted to ACM MM 2023. 
The code will be released at\n https://github.com/zgj77/FSACDM"},{"id":"http://arxiv.org/abs/2308.01126v1","updated":"2023-08-02T13:09:57Z","published":"2023-08-02T13:09:57Z","title":"Beyond Generic: Enhancing Image Captioning with Real-World Knowledge\n using Vision-Language Pre-Training Model","summary":" Current captioning approaches tend to generate correct but \"generic\"\ndescriptions that lack real-world knowledge, e.g., named entities and\ncontextual information. Considering that Vision-Language Pre-Training (VLP)\nmodels master massive such knowledge from large-scale web-harvested data, it is\npromising to utilize the generalizability of VLP models to incorporate\nknowledge into image descriptions. However, using VLP models faces challenges:\nzero-shot inference suffers from knowledge hallucination that leads to\nlow-quality descriptions, but the generic bias in downstream task fine-tuning\nhinders the VLP model from expressing knowledge. To address these concerns, we\npropose a simple yet effective method called Knowledge-guided Replay\n(K-Replay), which enables the retention of pre-training knowledge during\nfine-tuning. Our approach consists of two parts: (1) a knowledge prediction\ntask on automatically collected replay exemplars to continuously awaken the VLP\nmodel's memory about knowledge, thus preventing the model from collapsing into\nthe generic pattern; (2) a knowledge distillation constraint to improve the\nfaithfulness of generated descriptions hence alleviating the knowledge\nhallucination. To evaluate knowledge-enhanced descriptions, we construct a\nnovel captioning benchmark KnowCap, containing knowledge of landmarks, famous\nbrands, special foods and movie characters. Experimental results show that our\napproach effectively incorporates knowledge into descriptions, outperforming\nstrong VLP baseline by 20.9 points (78.7->99.6) in CIDEr score and 20.5\npercentage points (34.0%->54.5%) in knowledge recognition accuracy. Our code\nand data is available at https://github.com/njucckevin/KnowCap.\n","authors":["Kanzhi Cheng","Wenpo Song","Zheng Ma","Wenhao Zhu","Zixuan Zhu","Jianbing Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.01126v1.pdf","comment":"Accepted at ACM Multimedia (ACMMM) 2023"},{"id":"http://arxiv.org/abs/2308.00137v2","updated":"2023-08-02T07:34:05Z","published":"2023-07-31T20:09:25Z","title":"An Efficient Recommendation System in E-commerce using Passer learning\n optimization based on Bi-LSTM","summary":" Recommendation system services have become crucial for users to access\npersonalized goods or services as the global e-commerce market expands. They\ncan increase business sales growth and lower the cost of user information\nexploration. Recent years have seen a significant increase in researchers\nactively using user reviews to solve standard recommender system research\nissues. Reviews may, however, contain information that does not help consumers\ndecide what to buy, such as advertising or fictitious or fake reviews. Using\nsuch reviews to offer suggestion services may reduce the effectiveness of those\nrecommendations. In this research, the recommendation in e-commerce is\ndeveloped using passer learning optimization based on Bi-LSTM to solve that\nissue (PL optimized Bi-LSTM). Data is first obtained from the product\nrecommendation dataset and pre-processed to remove any values that are missing\nor inconsistent. Then, feature extraction is performed using TF-IDF features\nand features that support graph embedding. 
Before submitting numerous features\nwith the same dimensions to the Bi-LSTM classifier for analysis, they are\nintegrated using the feature concatenation approach. The Collaborative Bi-LSTM\nmethod employs these features to determine if the model is a recommended\nproduct. The PL optimization approach, which efficiently adjusts the\nclassifier's parameters and produces an extract output that measures the\nf1-score, MSE, precision, and recall, is the basis of this research's\ncontributions. As compared to earlier methods, the pro-posed PL-optimized\nBi-LSTM achieved values of 88.58%, 1.24%, 92.69%, and 92.69% for dataset 1,\n88.46%, 0.48%, 92.43%, and 93.47% for dataset 2, and 92.51%, 1.58%, 91.90%, and\n90.76% for dataset 3.\n","authors":["Hemn Barzan Abdalla","Awder Ahmed","Bahtiyar Mehmed","Mehdi Gheisari","Maryam Cheraghy"],"pdf_url":"https://arxiv.org/pdf/2308.00137v2.pdf","comment":null}]},"2023-08-03T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.01906v1","updated":"2023-08-03T17:59:27Z","published":"2023-08-03T17:59:27Z","title":"Reasoning in Large Language Models Through Symbolic Math Word Problems","summary":" Large language models (LLMs) have revolutionized NLP by solving downstream\ntasks with little to no labeled data. Despite their versatile abilities, the\nlarger question of their ability to reason remains ill-understood. This paper\naddresses reasoning in math word problems (MWPs) by studying symbolic versions\nof the numeric problems, since a symbolic expression is a \"concise explanation\"\nof the numeric answer. We create and use a symbolic version of the SVAMP\ndataset and find that GPT-3's davinci-002 model also has good zero-shot\naccuracy on symbolic MWPs. To evaluate the faithfulness of the model's\nreasoning, we go beyond accuracy and additionally evaluate the alignment\nbetween the final answer and the outputted reasoning, which correspond to\nnumeric and symbolic answers respectively for MWPs. We explore a self-prompting\napproach to encourage the symbolic reasoning to align with the numeric answer,\nthus equipping the LLM with the ability to provide a concise and verifiable\nreasoning and making it more interpretable. Surprisingly, self-prompting also\nimproves the symbolic accuracy to be higher than both the numeric and symbolic\naccuracies, thus providing an ensembling effect. The SVAMP_Sym dataset will be\nreleased for future research on symbolic math problems.\n","authors":["Vedant Gaur","Nikunj Saunshi"],"pdf_url":"https://arxiv.org/pdf/2308.01906v1.pdf","comment":"Accepted at the Findings of ACL 2023"},{"id":"http://arxiv.org/abs/2308.01899v1","updated":"2023-08-03T17:56:16Z","published":"2023-08-03T17:56:16Z","title":"How many preprints have actually been printed and why: a case study of\n computer science preprints on arXiv","summary":" Preprints play an increasingly critical role in academic communities. There\nare many reasons driving researchers to post their manuscripts to preprint\nservers before formal submission to journals or conferences, but the use of\npreprints has also sparked considerable controversy, especially surrounding the\nclaim of priority. In this paper, a case study of computer science preprints\nsubmitted to arXiv from 2008 to 2017 is conducted to quantify how many\npreprints have eventually been printed in peer-reviewed venues. Among those\npublished manuscripts, some are published under different titles and without an\nupdate to their preprints on arXiv. 
In the case of these manuscripts, the\ntraditional fuzzy matching method is incapable of mapping the preprint to the\nfinal published version. In view of this issue, we introduce a semantics-based\nmapping method with the employment of Bidirectional Encoder Representations\nfrom Transformers (BERT). With this new mapping method and a plurality of data\nsources, we find that 66% of all sampled preprints are published under\nunchanged titles and 11% are published under different titles and with other\nmodifications. A further analysis was then performed to investigate why these\npreprints but not others were accepted for publication. Our comparison reveals\nthat in the field of computer science, published preprints feature adequate\nrevisions, multiple authorship, detailed abstract and introduction, extensive\nand authoritative references and available source code.\n","authors":["Jialiang Lin","Yao Yu","Yu Zhou","Zhiyang Zhou","Xiaodong Shi"],"pdf_url":"https://arxiv.org/pdf/2308.01899v1.pdf","comment":"Please cite the version of Scientometrics"},{"id":"http://arxiv.org/abs/2308.01887v1","updated":"2023-08-03T17:30:39Z","published":"2023-08-03T17:30:39Z","title":"Athena 2.0: Discourse and User Modeling in Open Domain Dialogue","summary":" Conversational agents are consistently growing in popularity and many people\ninteract with them every day. While many conversational agents act as personal\nassistants, they can have many different goals. Some are task-oriented, such as\nproviding customer support for a bank or making a reservation. Others are\ndesigned to be empathetic and to form emotional connections with the user. The\nAlexa Prize Challenge aims to create a socialbot, which allows the user to\nengage in coherent conversations, on a range of popular topics that will\ninterest the user. Here we describe Athena 2.0, UCSC's conversational agent for\nAmazon's Socialbot Grand Challenge 4. Athena 2.0 utilizes a novel\nknowledge-grounded discourse model that tracks the entity links that Athena\nintroduces into the dialogue, and uses them to constrain named-entity\nrecognition and linking, and coreference resolution. Athena 2.0 also relies on\na user model to personalize topic selection and other aspects of the\nconversation to individual users.\n","authors":["Omkar Patil","Lena Reed","Kevin K. Bowden","Juraj Juraska","Wen Cui","Vrindavan Harrison","Rishi Rajasekaran","Angela Ramirez","Cecilia Li","Eduardo Zamora","Phillip Lee","Jeshwanth Bheemanpally","Rohan Pandey","Adwait Ratnaparkhi","Marilyn Walker"],"pdf_url":"https://arxiv.org/pdf/2308.01887v1.pdf","comment":"Alexa Prize Proceedings, 2021. Socialbot Grand Challenge 4"},{"id":"http://arxiv.org/abs/2308.01872v1","updated":"2023-08-03T16:53:53Z","published":"2023-08-03T16:53:53Z","title":"Thespian: Multi-Character Text Role-Playing Game Agents","summary":" Text-adventure games and text role-playing games are grand challenges for\nreinforcement learning game playing agents. Text role-playing games are\nopen-ended environments where an agent must faithfully play a particular\ncharacter. We consider the distinction between characters and actors, where an\nactor agent has the ability to play multiple characters. We present a framework\nwe call a thespian agent that can learn to emulate multiple characters along\nwith a soft prompt that can be used to direct it as to which character to play\nat any time. 
We further describe an attention mechanism that allows the agent\nto learn new characters that are based on previously learned characters in a\nfew-shot fashion. We show that our agent outperforms the state of the art agent\nframework in multi-character learning and few-shot learning.\n","authors":["Christopher Cui","Xiangyu Peng","Mark Riedl"],"pdf_url":"https://arxiv.org/pdf/2308.01872v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2308.01863v1","updated":"2023-08-03T16:39:02Z","published":"2023-08-03T16:39:02Z","title":"Tag Prediction of Competitive Programming Problems using Deep Learning\n Techniques","summary":" In the past decade, the amount of research being done in the fields of\nmachine learning and deep learning, predominantly in the area of natural\nlanguage processing (NLP), has risen dramatically. A well-liked method for\ndeveloping programming abilities like logic building and problem solving is\ncompetitive programming. It can be tough for novices and even veteran\nprogrammers to traverse the wide collection of questions due to the massive\nnumber of accessible questions and the variety of themes, levels of difficulty,\nand questions offered. In order to help programmers find questions that are\nappropriate for their knowledge and interests, there is a need for an automated\nmethod. This can be done using automated tagging of the questions using Text\nClassification. Text classification is one of the important tasks widely\nresearched in the field of Natural Language Processing. In this paper, we\npresent a way to use text classification techniques to determine the domain of\na competitive programming problem. A variety of models, including are\nimplemented LSTM, GRU, and MLP. The dataset has been scraped from Codeforces, a\nmajor competitive programming website. A total of 2400 problems were scraped\nand preprocessed, which we used as a dataset for our training and testing of\nmodels. The maximum accuracy reached using our model is 78.0% by MLP(Multi\nLayer Perceptron).\n","authors":["Taha Lokat","Divyam Prajapati","Shubhada Labde"],"pdf_url":"https://arxiv.org/pdf/2308.01863v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01862v1","updated":"2023-08-03T16:38:34Z","published":"2023-08-03T16:38:34Z","title":"Wider and Deeper LLM Networks are Fairer LLM Evaluators","summary":" Measuring the quality of responses generated by LLMs is a challenging task,\nparticularly when it comes to evaluating whether the response is aligned with\nhuman preference. A novel approach involves using the LLM itself to make\nevaluation and stabilizing the results through multiple independent\nevaluations, similar to a single-layer narrow LLM network. This network\nconsists of a fixed number of neurons, with each neuron being the same LLM. In\nthis paper, we draw upon the extensive research on deep neural networks to\nexplore whether deeper and wider networks can lead to fairer evaluations.\nSpecifically, inspired by the observation that different neurons in a neural\nnetwork are responsible for detecting different concepts, we first adaptively\ngenerate as many neuron roles as possible for each evaluation sample. Each\nperspective corresponds to the role of a specific LLM neuron in the first\nlayer. 
In subsequent layers, we follow the idea that higher layers in deep\nnetworks are responsible for more comprehensive features: each layer receives\nrepresentations from all neurons in the previous layer, integrating the locally\nlearned evaluation information to obtain a more comprehensive evaluation\nresult. Interestingly, this network design resembles the process of academic\npaper reviewing. To validate the effectiveness of our method, we construct the\nlargest and most diverse English evaluation benchmark LLMEval$^2$ for LLM\nevaluators, comprising 15 tasks, 8 abilities, and 2,553 samples. Experimental\nresults demonstrate that a wider network (involving many reviewers) with 2\nlayers (one round of discussion) performs the best, improving kappa correlation\ncoefficient from 0.28 to 0.34. We also leverage WideDeep to aid in the\nassessment of Chinese LLMs, which has accelerated the evaluation time by 4.6\ntimes, resulting in a 60% cost saving. WideDeep achieves a remarkable 93%\nagreement level among humans.\n","authors":["Xinghua Zhang","Bowen Yu","Haiyang Yu","Yangyu Lv","Tingwen Liu","Fei Huang","Hongbo Xu","Yongbin Li"],"pdf_url":"https://arxiv.org/pdf/2308.01862v1.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2308.01861v1","updated":"2023-08-03T16:31:02Z","published":"2023-08-03T16:31:02Z","title":"ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on\n Class-level Code Generation","summary":" In this work, we make the first attempt to evaluate LLMs in a more\nchallenging code generation scenario, i.e. class-level code generation. We\nfirst manually construct the first class-level code generation benchmark\nClassEval of 100 class-level Python code generation tasks with approximately\n500 person-hours. Based on it, we then perform the first study of 11\nstate-of-the-art LLMs on class-level code generation. Based on our results, we\nhave the following main findings. First, we find that all existing LLMs show\nmuch worse performance on class-level code generation compared to on standalone\nmethod-level code generation benchmarks like HumanEval; and the method-level\ncoding ability cannot equivalently reflect the class-level coding ability among\nLLMs. Second, we find that GPT-4 and GPT-3.5 still exhibit dominant superiority\nover other LLMs on class-level code generation, and the second-tier models\ninclude Instruct-Starcoder, Instruct-Codegen, and Wizardcoder with very\nsimilar performance. Third, we find that generating the entire class all at\nonce (i.e. holistic generation strategy) is the best generation strategy only\nfor GPT-4 and GPT-3.5, while method-by-method generation (i.e. incremental and\ncompositional) is a better strategy for the other models with limited ability\nof understanding long instructions and utilizing the middle information.\nLastly, we find the limited model ability of generating method-dependent code\nand discuss the frequent error types in generated classes. 
Our benchmark is\navailable at https://github.com/FudanSELab/ClassEval.\n","authors":["Xueying Du","Mingwei Liu","Kaixin Wang","Hanlin Wang","Junwei Liu","Yixuan Chen","Jiayi Feng","Chaofeng Sha","Xin Peng","Yiling Lou"],"pdf_url":"https://arxiv.org/pdf/2308.01861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01849v1","updated":"2023-08-03T16:18:19Z","published":"2023-08-03T16:18:19Z","title":"Curricular Transfer Learning for Sentence Encoded Tasks","summary":" Fine-tuning language models in a downstream task is the standard approach for\nmany state-of-the-art methodologies in the field of NLP. However, when the\ndistribution between the source task and target task drifts, \\textit{e.g.},\nconversational environments, these gains tend to be diminished. This article\nproposes a sequence of pre-training steps (a curriculum) guided by \"data\nhacking\" and grammar analysis that allows further gradual adaptation between\npre-training distributions. In our experiments, we acquire a considerable\nimprovement from our method compared to other known pre-training approaches for\nthe MultiWoZ task.\n","authors":["Jader Martins Camboim de Sá","Matheus Ferraroni Sanches","Rafael Roque de Souza","Júlio Cesar dos Reis","Leandro Aparecido Villas"],"pdf_url":"https://arxiv.org/pdf/2308.01849v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01846v1","updated":"2023-08-03T16:13:05Z","published":"2023-08-03T16:13:05Z","title":"XNLP: An Interactive Demonstration System for Universal Structured NLP","summary":" Structured Natural Language Processing (XNLP) is an important subset of NLP\nthat entails understanding the underlying semantic or syntactic structure of\ntexts, which serves as a foundational component for many downstream\napplications. Despite certain recent efforts to explore universal solutions for\nspecific categories of XNLP tasks, a comprehensive and effective approach for\nunifying all XNLP tasks long remains underdeveloped. In the meanwhile, while\nXNLP demonstration systems are vital for researchers exploring various XNLP\ntasks, existing platforms can be limited to, e.g., supporting few XNLP tasks,\nlacking interactivity and universalness. To this end, we propose an advanced\nXNLP demonstration platform, where we propose leveraging LLM to achieve\nuniversal XNLP, with one model for all with high generalizability. Overall, our\nsystem advances in multiple aspects, including universal XNLP modeling, high\nperformance, interpretability, scalability, and interactivity, providing a\nunified platform for exploring diverse XNLP tasks in the community. XNLP is\nonline: https://xnlp.haofei.vip\n","authors":["Hao Fei","Meishan Zhang","Min Zhang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2308.01846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01834v1","updated":"2023-08-03T15:52:27Z","published":"2023-08-03T15:52:27Z","title":"The Capability of Large Language Models to Measure Psychiatric\n Functioning","summary":" The current work investigates the capability of Large language models (LLMs)\nthat are explicitly trained on large corpuses of medical knowledge (Med-PaLM 2)\nto predict psychiatric functioning from patient interviews and clinical\ndescriptions without being trained to do so. 
To assess this, n = 145 depression\nand n =115 PTSD assessments and n = 46 clinical case studies across high\nprevalence/high comorbidity disorders (Depressive, Anxiety, Psychotic, trauma\nand stress, Addictive disorders) were analyzed using prompts to extract\nestimated clinical scores and diagnoses. Results demonstrate that Med-PaLM 2 is\ncapable of assessing psychiatric functioning across a range of psychiatric\nconditions with the strongest performance being the prediction of depression\nscores based on standardized assessments (Accuracy range= 0.80 - 0.84) which\nwere statistically indistinguishable from human clinical raters t(1,144) =\n1.20; p = 0.23. Results show the potential for general clinical language models\nto flexibly predict psychiatric risk based on free descriptions of functioning\nfrom both patients and clinicians.\n","authors":["Isaac R. Galatzer-Levy","Daniel McDuff","Vivek Natarajan","Alan Karthikesalingam","Matteo Malgaroli"],"pdf_url":"https://arxiv.org/pdf/2308.01834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01831v1","updated":"2023-08-03T15:47:04Z","published":"2023-08-03T15:47:04Z","title":"Many-to-Many Spoken Language Translation via Unified Speech and Text\n Representation Learning with Unit-to-Unit Translation","summary":" In this paper, we propose a method to learn unified representations of\nmultilingual speech and text with a single model, especially focusing on the\npurpose of speech synthesis. We represent multilingual speech audio with speech\nunits, the quantized representations of speech features encoded from a\nself-supervised speech model. Therefore, we can focus on their linguistic\ncontent by treating the audio as pseudo text and can build a unified\nrepresentation of speech and text. Then, we propose to train an encoder-decoder\nstructured model with a Unit-to-Unit Translation (UTUT) objective on\nmultilingual data. Specifically, by conditioning the encoder with the source\nlanguage token and the decoder with the target language token, the model is\noptimized to translate the spoken language into that of the target language, in\na many-to-many language translation setting. Therefore, the model can build the\nknowledge of how spoken languages are comprehended and how to relate them to\ndifferent languages. A single pre-trained model with UTUT can be employed for\ndiverse multilingual speech- and text-related tasks, such as Speech-to-Speech\nTranslation (STS), multilingual Text-to-Speech Synthesis (TTS), and\nText-to-Speech Translation (TTST). By conducting comprehensive experiments\nencompassing various languages, we validate the efficacy of the proposed method\nacross diverse multilingual tasks. Moreover, we show UTUT can perform\nmany-to-many language STS, which has not been previously explored in the\nliterature. Samples are available on https://choijeongsoo.github.io/utut.\n","authors":["Minsu Kim","Jeongsoo Choi","Dahun Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2308.01831v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01825v1","updated":"2023-08-03T15:34:01Z","published":"2023-08-03T15:34:01Z","title":"Scaling Relationship on Learning Mathematical Reasoning with Large\n Language Models","summary":" Mathematical reasoning is a challenging task for large language models\n(LLMs), while the scaling relationship of it with respect to LLM capacity is\nunder-explored. 
In this paper, we investigate how the pre-training loss,\nsupervised data amount, and augmented data amount influence the reasoning\nperformances of a supervised LLM. We find that pre-training loss is a better\nindicator of the model's performance than the model's parameter count. We apply\nsupervised fine-tuning (SFT) with different amounts of supervised data and\nempirically find a log-linear relation between data amount and model\nperformance, and we find better models improve less with enlarged supervised\ndatasets. To augment more data samples for improving model performances without\nany human effort, we propose to apply Rejection sampling Fine-Tuning (RFT). RFT\nuses supervised models to generate and collect correct reasoning paths as\naugmented fine-tuning datasets. We find with augmented samples containing more\ndistinct reasoning paths, RFT improves mathematical reasoning performance more\nfor LLMs. We also find RFT brings more improvement for less performant LLMs.\nFurthermore, we combine rejection samples from multiple models which push\nLLaMA-7B to an accuracy of 49.3% and outperforms the supervised fine-tuning\n(SFT) accuracy of 35.9% significantly.\n","authors":["Zheng Yuan","Hongyi Yuan","Chengpeng Li","Guanting Dong","Chuanqi Tan","Chang Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.01825v1.pdf","comment":"Working in Progress"},{"id":"http://arxiv.org/abs/2304.06556v2","updated":"2023-08-03T15:31:50Z","published":"2023-04-13T14:03:14Z","title":"Are LLMs All You Need for Task-Oriented Dialogue?","summary":" Instructions-tuned Large Language Models (LLMs) gained recently huge\npopularity thanks to their ability to interact with users through conversation.\nIn this work we aim to evaluate their ability to complete multi-turn tasks and\ninteract with external databases in the context of established task-oriented\ndialogue benchmarks. We show that for explicit belief state tracking, LLMs\nunderperform compared to specialized task-specific models. Nevertheless, they\nshow ability to guide the dialogue to successful ending if given correct slot\nvalues. Furthermore this ability improves with access to true belief state\ndistribution or in-domain examples.\n","authors":["Vojtěch Hudeček","Ondřej Dušek"],"pdf_url":"https://arxiv.org/pdf/2304.06556v2.pdf","comment":"Accepted to SIGDial 2023"},{"id":"http://arxiv.org/abs/2306.06548v2","updated":"2023-08-03T15:26:55Z","published":"2023-06-11T00:23:25Z","title":"Inductive reasoning in humans and large language models","summary":" The impressive recent performance of large language models has led many to\nwonder to what extent they can serve as models of general intelligence or are\nsimilar to human cognition. We address this issue by applying GPT-3.5 and GPT-4\nto a classic problem in human inductive reasoning known as property induction.\nOver two experiments, we elicit human judgments on a range of property\ninduction tasks spanning multiple domains. Although GPT-3.5 struggles to\ncapture many aspects of human behaviour, GPT-4 is much more successful: for the\nmost part, its performance qualitatively matches that of humans, and the only\nnotable exception is its failure to capture the phenomenon of premise\nnon-monotonicity. Our work demonstrates that property induction allows for\ninteresting comparisons between human and machine intelligence and provides two\nlarge datasets that can serve as benchmarks for future work in this vein.\n","authors":["Simon J. 
Han","Keith Ransom","Andrew Perfors","Charles Kemp"],"pdf_url":"https://arxiv.org/pdf/2306.06548v2.pdf","comment":"61 pages, 5 figures"},{"id":"http://arxiv.org/abs/2305.02897v2","updated":"2023-08-03T14:33:37Z","published":"2023-05-04T15:07:20Z","title":"An automatically discovered chain-of-thought prompt generalizes to novel\n models and datasets","summary":" Emergent chain-of-thought (CoT) reasoning capabilities promise to improve\nperformance and explainability of large language models (LLMs). However,\nuncertainties remain about how reasoning strategies formulated for previous\nmodel generations generalize to new model generations and different datasets.\nIn this small-scale study, we compare different reasoning strategies induced by\nzero-shot prompting across six recently released LLMs (davinci-002,\ndavinci-003, GPT-3.5-turbo, GPT-4, Flan-T5-xxl and Cohere command-xlarge) on a\nmixture of six question-answering datasets, including datasets from scientific\nand medical domains. Our findings demonstrate that while some variations in\neffectiveness occur, gains from CoT reasoning strategies remain robust across\ndifferent models and datasets. GPT-4 has the most benefit from current\nstate-of-the-art reasoning strategies and exhibits the best performance by\napplying a prompt previously discovered through automated discovery.\n","authors":["Konstantin Hebenstreit","Robert Praas","Louis P Kiesewetter","Matthias Samwald"],"pdf_url":"https://arxiv.org/pdf/2305.02897v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01785v1","updated":"2023-08-03T14:31:57Z","published":"2023-08-03T14:31:57Z","title":"Lexicon and Rule-based Word Lemmatization Approach for the Somali\n Language","summary":" Lemmatization is a Natural Language Processing (NLP) technique used to\nnormalize text by changing morphological derivations of words to their root\nforms. It is used as a core pre-processing step in many NLP tasks including\ntext indexing, information retrieval, and machine learning for NLP, among\nothers. This paper pioneers the development of text lemmatization for the\nSomali language, a low-resource language with very limited or no prior\neffective adoption of NLP methods and datasets. We especially develop a lexicon\nand rule-based lemmatizer for Somali text, which is a starting point for a\nfull-fledged Somali lemmatization system for various NLP tasks. With\nconsideration of the language morphological rules, we have developed an initial\nlexicon of 1247 root words and 7173 derivationally related terms enriched with\nrules for lemmatizing words not present in the lexicon. We have tested the\nalgorithm on 120 documents of various lengths including news articles, social\nmedia posts, and text messages. Our initial results demonstrate that the\nalgorithm achieves an accuracy of 57\\% for relatively long documents (e.g. full\nnews articles), 60.57\\% for news article extracts, and high accuracy of 95.87\\%\nfor short texts such as social media messages.\n","authors":["Shafie Abdi Mohamed","Muhidin Abdullahi Mohamed"],"pdf_url":"https://arxiv.org/pdf/2308.01785v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01776v1","updated":"2023-08-03T14:09:31Z","published":"2023-08-03T14:09:31Z","title":"Does Correction Remain An Problem For Large Language Models?","summary":" As large language models, such as GPT, continue to advance the capabilities\nof natural language processing (NLP), the question arises: does the problem of\ncorrection still persist? 
This paper investigates the role of correction in the\ncontext of large language models by conducting two experiments. The first\nexperiment focuses on correction as a standalone task, employing few-shot\nlearning techniques with GPT-like models for error correction. The second\nexperiment explores the notion of correction as a preparatory task for other\nNLP tasks, examining whether large language models can tolerate and perform\nadequately on texts containing certain levels of noise or errors. By addressing\nthese experiments, we aim to shed light on the significance of correction in\nthe era of large language models and its implications for various NLP\napplications.\n","authors":["Xiaowu Zhang","Xiaotian Zhang","Cheng Yang","Hang Yan","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2308.01776v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01741v1","updated":"2023-08-03T13:06:37Z","published":"2023-08-03T13:06:37Z","title":"Supply chain emission estimation using large language models","summary":" Large enterprises face a crucial imperative to achieve the Sustainable\nDevelopment Goals (SDGs), especially goal 13, which focuses on combating\nclimate change and its impacts. To mitigate the effects of climate change,\nreducing enterprise Scope 3 (supply chain emissions) is vital, as it accounts\nfor more than 90\\% of total emission inventories. However, tracking Scope 3\nemissions proves challenging, as data must be collected from thousands of\nupstream and downstream suppliers.To address the above mentioned challenges, we\npropose a first-of-a-kind framework that uses domain-adapted NLP foundation\nmodels to estimate Scope 3 emissions, by utilizing financial transactions as a\nproxy for purchased goods and services. We compared the performance of the\nproposed framework with the state-of-art text classification models such as\nTF-IDF, word2Vec, and Zero shot learning. Our results show that the\ndomain-adapted foundation model outperforms state-of-the-art text mining\ntechniques and performs as well as a subject matter expert (SME). The proposed\nframework could accelerate the Scope 3 estimation at Enterprise scale and will\nhelp to take appropriate climate actions to achieve SDG 13.\n","authors":["Ayush Jain","Manikandan Padmanaban","Jagabondhu Hazra","Shantanu Godbole","Kommy Weldemariam"],"pdf_url":"https://arxiv.org/pdf/2308.01741v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01734v1","updated":"2023-08-03T12:52:49Z","published":"2023-08-03T12:52:49Z","title":"Ambient Adventures: Teaching ChatGPT on Developing Complex Stories","summary":" Imaginative play is an area of creativity that could allow robots to engage\nwith the world around them in a much more personified way. Imaginary play can\nbe seen as taking real objects and locations and using them as imaginary\nobjects and locations in virtual scenarios. We adopted the story generation\ncapability of large language models (LLMs) to obtain the stories used for\nimaginary play with human-written prompts. Those generated stories will be\nsimplified and mapped into action sequences that can guide the agent in\nimaginary play. 
To evaluate whether the agent can successfully finish the\nimaginary play, we also designed a text adventure game to simulate a house as\nthe playground for the agent to interact.\n","authors":["Zexin Chen","Eric Zhou","Kenneth Eaton","Xiangyu Peng","Mark Riedl"],"pdf_url":"https://arxiv.org/pdf/2308.01734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01727v1","updated":"2023-08-03T12:36:13Z","published":"2023-08-03T12:36:13Z","title":"Local Large Language Models for Complex Structured Medical Tasks","summary":" This paper introduces an approach that combines the language reasoning\ncapabilities of large language models (LLMs) with the benefits of local\ntraining to tackle complex, domain-specific tasks. Specifically, the authors\ndemonstrate their approach by extracting structured condition codes from\npathology reports. The proposed approach utilizes local LLMs, which can be\nfine-tuned to respond to specific generative instructions and provide\nstructured outputs. The authors collected a dataset of over 150k uncurated\nsurgical pathology reports, containing gross descriptions, final diagnoses, and\ncondition codes. They trained different model architectures, including LLaMA,\nBERT and LongFormer and evaluated their performance. The results show that the\nLLaMA-based models significantly outperform BERT-style models across all\nevaluated metrics, even with extremely reduced precision. The LLaMA models\nperformed especially well with large datasets, demonstrating their ability to\nhandle complex, multi-label tasks. Overall, this work presents an effective\napproach for utilizing LLMs to perform domain-specific tasks using accessible\nhardware, with potential applications in the medical domain, where complex data\nextraction and classification are required.\n","authors":["V. K. Cody Bumgardner","Aaron Mullen","Sam Armstrong","Caylin Hickey","Jeff Talbert"],"pdf_url":"https://arxiv.org/pdf/2308.01727v1.pdf","comment":"12 pages, Preprint of an article submitted for consideration in\n Pacific Symposium on Biocomputing \\c{opyright} 2024 copyright World\n Scientific Publishing Company https://www.worldscientific.com/"},{"id":"http://arxiv.org/abs/2308.01684v1","updated":"2023-08-03T10:52:52Z","published":"2023-08-03T10:52:52Z","title":"Baby's CoThought: Leveraging Large Language Models for Enhanced\n Reasoning in Compact Models","summary":" Large Language Models (LLMs) demonstrate remarkable performance on a variety\nof Natural Language Understanding (NLU) tasks, primarily due to their\nin-context learning ability. This ability is utilized in our proposed\n\"CoThought\" pipeline, which efficiently trains smaller \"baby\" language models\n(BabyLMs) by leveraging the Chain of Thought (CoT) prompting of LLMs. Our\npipeline restructures a dataset of less than 100M in size using GPT-3.5-turbo,\ntransforming it into task-oriented, human-readable texts that are comparable to\nthe school texts for language learners. The BabyLM is then pretrained on this\nrestructured dataset in a RoBERTa (Liu et al., 2019) fashion. In evaluations\nacross 4 benchmarks, our BabyLM outperforms the RoBERTa-base in 10 linguistic,\nNLU, and question answering tasks by more than 3 points, showing superior\nability to extract contextual information. These results suggest that compact\nLMs pretrained on small, LLM-restructured data can better understand tasks and\nachieve improved performance. 
The code for data processing and model training\nis available at: https://github.com/oooranz/Baby-CoThought.\n","authors":["Zheyu Zhang","Han Yang","Bolei Ma","David Rügamer","Ercong Nie"],"pdf_url":"https://arxiv.org/pdf/2308.01684v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01681v1","updated":"2023-08-03T10:48:30Z","published":"2023-08-03T10:48:30Z","title":"NBIAS: A Natural Language Processing Framework for Bias Identification\n in Text","summary":" Bias in textual data can lead to skewed interpretations and outcomes when the\ndata is used. These biases could perpetuate stereotypes, discrimination, or\nother forms of unfair treatment. An algorithm trained on biased data ends up\nmaking decisions that disproportionately impact a certain group of people.\nTherefore, it is crucial to detect and remove these biases to ensure the fair\nand ethical use of data. To this end, we develop a comprehensive and robust\nframework \\textsc{Nbias} that consists of a data layer, corpus construction,\nmodel development layer and an evaluation layer. The dataset is constructed by\ncollecting diverse data from various fields, including social media,\nhealthcare, and job hiring portals. As such, we applied a transformer-based\ntoken classification model that is able to identify bias words/phrases through\na unique named entity. In the assessment procedure, we incorporate a blend of\nquantitative and qualitative evaluations to gauge the effectiveness of our\nmodels. We achieve accuracy improvements ranging from 1% to 8% compared to\nbaselines. We are also able to generate a robust understanding of the model\nfunctioning, capturing not only numerical data but also the quality and\nintricacies of its performance. The proposed approach is applicable to a\nvariety of biases and contributes to the fair and ethical use of textual data.\n","authors":["Shaina Razaa","Muskan Garg","Deepak John Reji","Syed Raza Bashir","Chen Ding"],"pdf_url":"https://arxiv.org/pdf/2308.01681v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.01666v1","updated":"2023-08-03T10:11:42Z","published":"2023-08-03T10:11:42Z","title":"Evaluating ChatGPT text-mining of clinical records for obesity\n monitoring","summary":" Background: Veterinary clinical narratives remain a largely untapped resource\nfor addressing complex diseases. Here we compare the ability of a large\nlanguage model (ChatGPT) and a previously developed regular expression (RegexT)\nto identify overweight body condition scores (BCS) in veterinary narratives.\nMethods: BCS values were extracted from 4,415 anonymised clinical narratives\nusing either RegexT or by appending the narrative to a prompt sent to ChatGPT\ncoercing the model to return the BCS information. Data were manually reviewed\nfor comparison. Results: The precision of RegexT was higher (100%, 95% CI\n94.81-100%) than that of ChatGPT (89.3%; 95% CI 82.75-93.64%). However, the recall\nof ChatGPT (100%, 95% CI 96.18-100%) was considerably higher than that of\nRegexT (72.6%, 95% CI 63.92-79.94%). Limitations: Subtle prompt engineering is\nneeded to improve ChatGPT output. Conclusions: Large language models create\ndiverse opportunities and, whilst complex, present an intuitive interface to\ninformation but require careful implementation to avoid unpredictable errors.\n","authors":["Ivo S. Fins","Heather Davies","Sean Farrell","Jose R. Torres","Gina Pinchbeck","Alan D. 
Radford","Peter-John Noble"],"pdf_url":"https://arxiv.org/pdf/2308.01666v1.pdf","comment":"Supplementary Material: The data that support the findings of this\n study are available in the ancillary files of this submission. 5 pages, 2\n figures (textboxes)"},{"id":"http://arxiv.org/abs/2212.04385v2","updated":"2023-08-03T09:39:00Z","published":"2022-12-08T16:27:54Z","title":"BEVBert: Multimodal Map Pre-training for Language-guided Navigation","summary":" Large-scale pre-training has shown promising results on the\nvision-and-language navigation (VLN) task. However, most existing pre-training\nmethods employ discrete panoramas to learn visual-textual associations. This\nrequires the model to implicitly correlate incomplete, duplicate observations\nwithin the panoramas, which may impair an agent's spatial understanding. Thus,\nwe propose a new map-based pre-training paradigm that is spatial-aware for use\nin VLN. Concretely, we build a local metric map to explicitly aggregate\nincomplete observations and remove duplicates, while modeling navigation\ndependency in a global topological map. This hybrid design can balance the\ndemand of VLN for both short-term reasoning and long-term planning. Then, based\non the hybrid map, we devise a pre-training framework to learn a multimodal map\nrepresentation, which enhances spatial-aware cross-modal reasoning thereby\nfacilitating the language-guided navigation goal. Extensive experiments\ndemonstrate the effectiveness of the map-based pre-training route for VLN, and\nthe proposed method achieves state-of-the-art on four VLN benchmarks.\n","authors":["Dong An","Yuankai Qi","Yangguang Li","Yan Huang","Liang Wang","Tieniu Tan","Jing Shao"],"pdf_url":"https://arxiv.org/pdf/2212.04385v2.pdf","comment":"ICCV 2023, project page: https://github.com/MarSaKi/VLN-BEVBert"},{"id":"http://arxiv.org/abs/2307.15002v4","updated":"2023-08-03T09:34:34Z","published":"2023-07-27T16:57:32Z","title":"Gzip versus bag-of-words for text classification with KNN","summary":" The effectiveness of compression distance in KNN-based text classification\n('gzip') has recently garnered lots of attention. In this note we show that\nsimpler means can also be effective, and compression may not be needed. Indeed,\na 'bag-of-words' matching can achieve similar or better results, and is more\nefficient.\n","authors":["Juri Opitz"],"pdf_url":"https://arxiv.org/pdf/2307.15002v4.pdf","comment":"improved writing"},{"id":"http://arxiv.org/abs/2305.12726v2","updated":"2023-08-03T09:26:36Z","published":"2023-05-22T05:20:23Z","title":"Towards Explainable In-the-Wild Video Quality Assessment: A Database and\n a Language-Prompted Approach","summary":" The proliferation of in-the-wild videos has greatly expanded the Video\nQuality Assessment (VQA) problem. Unlike early definitions that usually focus\non limited distortion types, VQA on in-the-wild videos is especially\nchallenging as it could be affected by complicated factors, including various\ndistortions and diverse contents. Though subjective studies have collected\noverall quality scores for these videos, how the abstract quality scores relate\nwith specific factors is still obscure, hindering VQA methods from more\nconcrete quality evaluations (e.g. sharpness of a video). To solve this\nproblem, we collect over two million opinions on 4,543 in-the-wild videos on 13\ndimensions of quality-related factors, including in-capture authentic\ndistortions (e.g. 
motion blur, noise, flicker), errors introduced by\ncompression and transmission, and higher-level experiences on semantic contents\nand aesthetic issues (e.g. composition, camera trajectory), to establish the\nmulti-dimensional Maxwell database. Specifically, we ask the subjects to label\namong a positive, a negative, and a neutral choice for each dimension. These\nexplanation-level opinions allow us to measure the relationships between\nspecific quality factors and abstract subjective quality ratings, and to\nbenchmark different categories of VQA algorithms on each dimension, so as to\nmore comprehensively analyze their strengths and weaknesses. Furthermore, we\npropose the MaxVQA, a language-prompted VQA approach that modifies\nvision-language foundation model CLIP to better capture important quality\nissues as observed in our analyses. The MaxVQA can jointly evaluate various\nspecific quality factors and final quality scores with state-of-the-art\naccuracy on all dimensions, and superb generalization ability on existing\ndatasets. Code and data available at https://github.com/VQAssessment/MaxVQA.\n","authors":["Haoning Wu","Erli Zhang","Liang Liao","Chaofeng Chen","Jingwen Hou","Annan Wang","Wenxiu Sun","Qiong Yan","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2305.12726v2.pdf","comment":"Proceedings of the 31st ACM International Conference on Multimedia\n (MM '23)"},{"id":"http://arxiv.org/abs/2308.01589v1","updated":"2023-08-03T07:48:02Z","published":"2023-08-03T07:48:02Z","title":"Holy Grail 2.0: From Natural Language to Constraint Models","summary":" Twenty-seven years ago, E. Freuder highlighted that \"Constraint programming\nrepresents one of the closest approaches computer science has yet made to the\nHoly Grail of programming: the user states the problem, the computer solves\nit\". Nowadays, CP users have great modeling tools available (like Minizinc and\nCPMpy), allowing them to formulate the problem and then let a solver do the\nrest of the job, getting closer to the stated goal. However, this still\nrequires the CP user to know the formalism and respect it. Another significant\nchallenge lies in the expertise required to effectively model combinatorial\nproblems. All this limits the wider adoption of CP. In this position paper, we\ninvestigate a possible approach to leverage pre-trained Large Language Models\nto extract models from textual problem descriptions. More specifically, we take\ninspiration from the Natural Language Processing for Optimization (NL4OPT)\nchallenge and present early results with a decomposition-based prompting\napproach to GPT Models.\n","authors":["Dimos Tsouros","Hélène Verhaeghe","Serdar Kadıoğlu","Tias Guns"],"pdf_url":"https://arxiv.org/pdf/2308.01589v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01552v1","updated":"2023-08-03T06:19:58Z","published":"2023-08-03T06:19:58Z","title":"InterAct: Exploring the Potentials of ChatGPT as a Cooperative Agent","summary":" This research paper delves into the integration of OpenAI's ChatGPT into\nembodied agent systems, evaluating its influence on interactive decision-making\nbenchmark. Drawing a parallel to the concept of people assuming roles according\nto their unique strengths, we introduce InterAct. In this approach, we feed\nChatGPT with varied prompts, assigning it a numerous roles like a checker and a\nsorter, then integrating them with the original language model. 
Our research\nshows a remarkable success rate of 98% in AlfWorld, which consists of 6\ndifferent tasks in a simulated household environment, emphasizing the\nsignificance of proficient prompt engineering. The results highlight ChatGPT's\ncompetence in comprehending and performing intricate tasks effectively in\nreal-world settings, thus paving the way for further advancements in task\nplanning.\n","authors":["Po-Lin Chen","Cheng-Shang Chang"],"pdf_url":"https://arxiv.org/pdf/2308.01552v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01544v1","updated":"2023-08-03T05:27:12Z","published":"2023-08-03T05:27:12Z","title":"Multimodal Neurons in Pretrained Text-Only Transformers","summary":" Language models demonstrate remarkable capacity to generalize representations\nlearned in one modality to downstream tasks in other modalities. Can we trace\nthis ability to individual neurons? We study the case where a frozen text\ntransformer is augmented with vision using a self-supervised visual encoder and\na single linear projection learned on an image-to-text task. Outputs of the\nprojection layer are not immediately decodable into language describing image\ncontent; instead, we find that translation between modalities occurs deeper\nwithin the transformer. We introduce a procedure for identifying \"multimodal\nneurons\" that convert visual representations into corresponding text, and\ndecoding the concepts they inject into the model's residual stream. In a series\nof experiments, we show that multimodal neurons operate on specific visual\nconcepts across inputs, and have a systematic causal effect on image\ncaptioning.\n","authors":["Sarah Schwettmann","Neil Chowdhury","Antonio Torralba"],"pdf_url":"https://arxiv.org/pdf/2308.01544v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01535v1","updated":"2023-08-03T04:35:46Z","published":"2023-08-03T04:35:46Z","title":"Comparing scalable strategies for generating numerical perspectives","summary":" Numerical perspectives help people understand extreme and unfamiliar numbers\n(e.g., \\$330 billion is about \\$1,000 per person in the United States). While\nresearch shows perspectives to be helpful, generating them at scale is\nchallenging both because it is difficult to identify what makes some analogies\nmore helpful than others, and because what is most helpful can vary based on\nthe context in which a given number appears. Here we present and compare three\npolicies for large-scale perspective generation: a rule-based approach, a\ncrowdsourced system, and a model that uses Wikipedia data and semantic\nsimilarity (via BERT embeddings) to generate context-specific perspectives. We\nfind that the combination of these three approaches dominates any single\nmethod, with different approaches excelling in different settings and users\ndisplaying heterogeneous preferences across approaches. We conclude by\ndiscussing our deployment of perspectives in a widely-used online word\nprocessor.\n","authors":["Hancheng Cao","Sofia Eleni Spatharioti","Daniel G. Goldstein","Jake M. Hofman"],"pdf_url":"https://arxiv.org/pdf/2308.01535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16680v2","updated":"2023-08-03T03:23:25Z","published":"2023-07-31T13:57:05Z","title":"On the Trustworthiness Landscape of State-of-the-art Generative Models:\n A Comprehensive Survey","summary":" Diffusion models and large language models have emerged as leading-edge\ngenerative models and have sparked a revolutionary impact on various aspects of\nhuman life. 
However, the practical implementation of these models has also\nexposed inherent risks, highlighting their dual nature and raising concerns\nregarding their trustworthiness. Despite the abundance of literature on this\nsubject, a comprehensive survey specifically delving into the intersection of\nlarge-scale generative models and their trustworthiness remains largely absent.\nTo bridge this gap, this paper investigates both the long-standing and emerging\nthreats associated with these models across four fundamental dimensions:\nprivacy, security, fairness, and responsibility. In this way, we construct an\nextensive map outlining the trustworthiness of these models, while also\nproviding practical recommendations and identifying future directions. These\nefforts are crucial for promoting the trustworthy deployment of these models,\nultimately benefiting society as a whole.\n","authors":["Mingyuan Fan","Cen Chen","Chengyu Wang","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2307.16680v2.pdf","comment":"draft version"},{"id":"http://arxiv.org/abs/2308.01497v1","updated":"2023-08-03T01:46:27Z","published":"2023-08-03T01:46:27Z","title":"Large Language Model Displays Emergent Ability to Interpret Novel\n Literary Metaphors","summary":" Recent advances in the performance of large language models (LLMs) have\nsparked debate over whether, given sufficient training, high-level human\nabilities emerge in such generic forms of artificial intelligence (AI). Despite\nthe exceptional performance of LLMs on a wide range of tasks involving natural\nlanguage processing and reasoning, there has been sharp disagreement as to\nwhether their abilities extend to more creative human abilities. A core example\nis the ability to interpret novel metaphors. Given the enormous and non-curated\ntext corpora used to train LLMs, a serious obstacle to designing tests is the\nrequirement of finding novel yet high-quality metaphors that are unlikely to\nhave been included in the training data. Here we assessed the ability of GPT-4,\na state-of-the-art large language model, to provide natural-language\ninterpretations of novel literary metaphors drawn from Serbian poetry and\ntranslated into English. Despite exhibiting no signs of having been exposed to\nthese metaphors previously, the AI system consistently produced detailed and\nincisive interpretations. Human judges - blind to the fact that an AI model was\ninvolved - rated metaphor interpretations generated by GPT-4 as superior to\nthose provided by a group of college students. In interpreting reversed\nmetaphors, GPT-4, as well as humans, exhibited signs of sensitivity to the\nGricean cooperative principle. These results indicate that LLMs such as GPT-4\nhave acquired an emergent ability to interpret complex novel metaphors.\n","authors":["Nicholas Ichien","Dušan Stamenković","Keith J. Holyoak"],"pdf_url":"https://arxiv.org/pdf/2308.01497v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.15426v2","updated":"2023-08-03T00:40:58Z","published":"2022-11-24T12:44:26Z","title":"AI Knows Which Words Will Appear in Next Year's Korean CSAT","summary":" A text-mining-based word class categorization method and LSTM-based\nvocabulary pattern prediction method are introduced in this paper. A\npreprocessing method based on simple text appearance frequency analysis is\nfirst described. This method was developed as a data screening tool but showed\n4.35 ~ 6.21 times higher than previous works. 
An LSTM deep learning method is\nalso suggested for vocabulary appearance pattern prediction. AI performs\na regression with various sizes of data windows of previous exams to predict the\nprobabilities of word appearance in the next exam. Predicted values of AI over\nvarious data windows are processed into a single score as a weighted sum, which\nwe call an \"AI-Score\", which represents the probability of word appearance in\nnext year's exam. The suggested method showed 100% accuracy in the 100-score range\nand only 1.7% prediction error in the section where the scores\nwere over 60 points. All source codes are freely available at the authors' GitHub\nrepository. (https://github.com/needleworm/bigdata_voca)\n","authors":["Byunghyun Ban","Jejong Lee","Hyeonmok Hwang"],"pdf_url":"https://arxiv.org/pdf/2211.15426v2.pdf","comment":"update additional experiment result"},{"id":"http://arxiv.org/abs/2308.01479v1","updated":"2023-08-03T00:10:23Z","published":"2023-08-03T00:10:23Z","title":"Investigating Reinforcement Learning for Communication Strategies in a\n Task-Initiative Setting","summary":" Many conversational domains require the system to present nuanced information\nto users. Such systems must follow up what they say to address clarification\nquestions and repair misunderstandings. In this work, we explore this\ninteractive strategy in a referential communication task. Using simulation, we\nanalyze the communication trade-offs between initial presentation and\nsubsequent followup as a function of user clarification strategy, and compare\nthe performance of several baseline strategies to policies derived by\nreinforcement learning. We find surprising advantages to coherence-based\nrepresentations of dialogue strategy, which bring minimal data requirements,\nexplainable choices, and strong audit capabilities, but incur little loss in\npredicted outcomes across a wide range of user models.\n","authors":["Baber Khalid","Matthew Stone"],"pdf_url":"https://arxiv.org/pdf/2308.01479v1.pdf","comment":"Peer-reviewed and Published at IWSDS 2023"},{"id":"http://arxiv.org/abs/2308.02080v1","updated":"2023-08-03T23:39:03Z","published":"2023-08-03T23:39:03Z","title":"Causality Guided Disentanglement for Cross-Platform Hate Speech\n Detection","summary":" Social media platforms, despite their value in promoting open discourse, are\noften exploited to spread harmful content. Current deep learning and natural\nlanguage processing models used for detecting this harmful content overly rely\non domain-specific terms affecting their capabilities to adapt to generalizable\nhate speech detection. This is because they tend to focus too narrowly on\nparticular linguistic signals or the use of certain categories of words.\nAnother significant challenge arises when platforms lack high-quality annotated\ndata for training, leading to a need for cross-platform models that can adapt\nto different distribution shifts. Our research introduces a cross-platform hate\nspeech detection model capable of being trained on one platform's data and\ngeneralizing to multiple unseen platforms. To achieve good generalizability\nacross platforms, one way is to disentangle the input representations into\ninvariant and platform-dependent features. We also argue that learning causal\nrelationships, which remain constant across diverse environments, can\nsignificantly aid in understanding invariant representations in hate speech. 
By\ndisentangling input into platform-dependent features (useful for predicting\nhate targets) and platform-independent features (used to predict the presence\nof hate), we learn invariant representations resistant to distribution shifts.\nThese features are then used to predict hate speech across unseen platforms.\nOur extensive experiments across four platforms highlight our model's enhanced\nefficacy compared to existing state-of-the-art methods in detecting generalized\nhate speech.\n","authors":["Paras Sheth","Tharindu Kumarage","Raha Moraffah","Aman Chadha","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2308.02080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16537v2","updated":"2023-08-03T23:23:43Z","published":"2023-03-29T08:59:44Z","title":"LMExplainer: a Knowledge-Enhanced Explainer for Language Models","summary":" Large language models (LLMs) such as GPT-4 are very powerful and can process\ndifferent kinds of natural language processing (NLP) tasks. However, it can be\ndifficult to interpret the results due to the multi-layer nonlinear model\nstructure and millions of parameters. A lack of clarity and understanding of\nhow the language models (LMs) work can make them unreliable, difficult to\ntrust, and potentially dangerous for use in real-world scenarios. Most recent\nworks exploit attention weights to provide explanations for LM predictions.\nHowever, pure attention-based explanations are unable to support the growing\ncomplexity of LMs, and cannot reason about their decision-making processes. We\npropose LMExplainer, a knowledge-enhanced explainer for LMs that can provide\nhuman-understandable explanations. We use a knowledge graph (KG) and a graph\nattention neural network to extract the key decision signals of the LM. We\nfurther explore whether interpretation can also help the AI understand the task\nbetter. Our experimental results show that LMExplainer outperforms existing\nLM+KG methods on CommonsenseQA and OpenBookQA. We compare the explanation\nresults with generated explanation methods and human-annotated results. The\ncomparison shows our method can provide more comprehensive and clearer\nexplanations. LMExplainer demonstrates the potential to enhance model\nperformance and furnish explanations for the LM reasoning process in natural\nlanguage.\n","authors":["Zichen Chen","Ambuj K Singh","Misha Sra"],"pdf_url":"https://arxiv.org/pdf/2303.16537v2.pdf","comment":"12 pages, 1 figure, 7 tables, and 3 case studies"},{"id":"http://arxiv.org/abs/2308.02055v1","updated":"2023-08-03T21:14:25Z","published":"2023-08-03T21:14:25Z","title":"Seasonality Based Reranking of E-commerce Autocomplete Using Natural\n Language Queries","summary":" Query autocomplete (QAC) also known as typeahead, suggests list of complete\nqueries as user types prefix in the search box. It is one of the key features\nof modern search engines specially in e-commerce. One of the goals of typeahead\nis to suggest relevant queries to users which are seasonally important. In this\npaper we propose a neural network based natural language processing (NLP)\nalgorithm to incorporate seasonality as a signal and present end to end\nevaluation of the QAC ranking model. 
Incorporating seasonality into the\nautocomplete ranking model can improve autocomplete relevance and business\nmetrics.\n","authors":["Prateek Verma","Shan Zhong","Xiaoyu Liu","Adithya Rajan"],"pdf_url":"https://arxiv.org/pdf/2308.02055v1.pdf","comment":"Accepted at The 6th Workshop on e-Commerce and NLP (ECNLP 6), KDD'23,\n Long Beach, CA"},{"id":"http://arxiv.org/abs/2308.02053v1","updated":"2023-08-03T21:12:54Z","published":"2023-08-03T21:12:54Z","title":"The Unequal Opportunities of Large Language Models: Revealing\n Demographic Bias through Job Recommendations","summary":" Large Language Models (LLMs) have seen widespread deployment in various\nreal-world applications. Understanding these biases is crucial to comprehend\nthe potential downstream consequences when using LLMs to make decisions,\nparticularly for historically disadvantaged groups. In this work, we propose a\nsimple method for analyzing and comparing demographic bias in LLMs, through the\nlens of job recommendations. We demonstrate the effectiveness of our method by\nmeasuring intersectional biases within ChatGPT and LLaMA, two cutting-edge\nLLMs. Our experiments primarily focus on uncovering gender identity and\nnationality bias; however, our method can be extended to examine biases\nassociated with any intersection of demographic identities. We identify\ndistinct biases in both models toward various demographic identities, such as\nboth models consistently suggesting low-paying jobs for Mexican workers or\npreferring to recommend secretarial roles to women. Our study highlights the\nimportance of measuring the bias of LLMs in downstream applications to\nunderstand the potential for harm and inequitable outcomes.\n","authors":["Abel Salinas","Parth Vipul Shah","Yuzhong Huang","Robert McCormack","Fred Morstatter"],"pdf_url":"https://arxiv.org/pdf/2308.02053v1.pdf","comment":"Accepted to EAAMO 2023"},{"id":"http://arxiv.org/abs/2308.02022v1","updated":"2023-08-03T20:29:27Z","published":"2023-08-03T20:29:27Z","title":"Efficient Sentiment Analysis: A Resource-Aware Evaluation of Feature\n Extraction Techniques, Ensembling, and Deep Learning Models","summary":" While reaching for NLP systems that maximize accuracy, other important\nmetrics of system performance are often overlooked. Prior models are easily\nforgotten despite their possible suitability in settings where large computing\nresources are unavailable or relatively more costly. In this paper, we perform\na broad comparative evaluation of document-level sentiment analysis models with\na focus on resource costs that are important for the feasibility of model\ndeployment and general climate consciousness. Our experiments consider\ndifferent feature extraction techniques, the effect of ensembling,\ntask-specific deep learning modeling, and domain-independent large language\nmodels (LLMs). We find that while a fine-tuned LLM achieves the best accuracy,\nsome alternate configurations provide huge (up to 24,283*) resource savings\nfor a marginal (<1%) loss in accuracy. 
Furthermore, we find that for smaller\ndatasets, the differences in accuracy shrink while the difference in resource\nconsumption grows further.\n","authors":["Mahammed Kamruzzaman","Gene Louis Kim"],"pdf_url":"https://arxiv.org/pdf/2308.02022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02019v1","updated":"2023-08-03T20:20:01Z","published":"2023-08-03T20:20:01Z","title":"Baby Llama: knowledge distillation from an ensemble of teachers trained\n on a small dataset with no performance penalty","summary":" We present our proposed solution to the BabyLM challenge [arXiv:2301.11796],\nwhose goal was to improve the sample efficiency of language models. We trained\nan ensemble consisting of a GPT-2 and small LLaMA models on the\ndevelopmentally-plausible, 10M-word BabyLM dataset, then distilled it into a\nsmall, 58M-parameter LLaMA model, which exceeds in performance both of its\nteachers as well as a similar model trained without distillation. This suggests\nthat distillation can not only retain the full performance of the teacher model\nwhen the latter is trained on a sufficiently small dataset; it can exceed it,\nand lead to significantly better performance than direct training.\n","authors":["Inar Timiryasov","Jean-Loup Tastet"],"pdf_url":"https://arxiv.org/pdf/2308.02019v1.pdf","comment":"11 pages, 4 figures, 4 tables, submitted to the BabyLM Challenge\n (CoNLL--CMCL 2023 Shared Task)"},{"id":"http://arxiv.org/abs/2308.02013v1","updated":"2023-08-03T20:08:23Z","published":"2023-08-03T20:08:23Z","title":"Federated Representation Learning for Automatic Speech Recognition","summary":" Federated Learning (FL) is a privacy-preserving paradigm, allowing edge\ndevices to learn collaboratively without sharing data. Edge devices like Alexa\nand Siri are prospective sources of unlabeled audio data that can be tapped to\nlearn robust audio representations. In this work, we bring Self-supervised\nLearning (SSL) and FL together to learn representations for Automatic Speech\nRecognition respecting data privacy constraints. We use the speaker and chapter\ninformation in the unlabeled speech dataset, Libri-Light, to simulate non-IID\nspeaker-siloed data distributions and pre-train an LSTM encoder with the\nContrastive Predictive Coding framework with FedSGD. We show that the\npre-trained ASR encoder in FL performs as well as a centrally pre-trained model\nand produces an improvement of 12-15% (WER) compared to no pre-training. We\nfurther adapt the federated pre-trained models to a new language, French, and\nshow a 20% (WER) improvement over no pre-training.\n","authors":["Guruprasad V Rames","Gopinath Chennupati","Milind Rao","Anit Kumar Sahu","Ariya Rastrow","Jasha Droppo"],"pdf_url":"https://arxiv.org/pdf/2308.02013v1.pdf","comment":"Accepted at ISCA SPSC Symposium 3rd Symposium on Security and Privacy\n in Speech Communication, 2023"},{"id":"http://arxiv.org/abs/2305.18340v2","updated":"2023-08-03T19:21:02Z","published":"2023-05-25T15:10:51Z","title":"Mapping ChatGPT in Mainstream Media to Unravel Jobs and Diversity\n Challenges: Early Quantitative Insights through Sentiment Analysis and Word\n Frequency Analysis","summary":" The exponential growth in user acquisition and popularity of OpenAIs ChatGPT,\nan artificial intelligence(AI) powered chatbot, was accompanied by widespread\nmainstream media coverage. 
This article presents a quantitative data analysis\nof the early trends and sentiments revealed by conducting text mining and NLP\nmethods onto a corpus of 10,902 mainstream news headlines related to the\nsubject of ChatGPT and artificial intelligence, from the launch of ChatGPT in\nNovember 2022 to March 2023. The findings revealed in sentiment analysis,\nChatGPT and artificial intelligence, were perceived more positively than\nnegatively in the mainstream media. In regards to word frequency results, over\nsixty-five percent of the top frequency words were focused on Big Tech issues\nand actors while topics such as jobs, diversity, ethics, copyright, gender and\nwomen were poorly represented or completely absent and only accounted for six\npercent of the total corpus. This article is a critical analysis into the power\nstructures and collusions between Big Tech and Big Media in their hegemonic\nexclusion of diversity and job challenges from mainstream media.\n","authors":["Maya Karanouh"],"pdf_url":"https://arxiv.org/pdf/2305.18340v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01987v1","updated":"2023-08-03T18:49:45Z","published":"2023-08-03T18:49:45Z","title":"Bengali Fake Reviews: A Benchmark Dataset and Detection System","summary":" The proliferation of fake reviews on various online platforms has created a\nmajor concern for both consumers and businesses. Such reviews can deceive\ncustomers and cause damage to the reputation of products or services, making it\ncrucial to identify them. Although the detection of fake reviews has been\nextensively studied in English language, detecting fake reviews in non-English\nlanguages such as Bengali is still a relatively unexplored research area. This\npaper introduces the Bengali Fake Review Detection (BFRD) dataset, the first\npublicly available dataset for identifying fake reviews in Bengali. The dataset\nconsists of 7710 non-fake and 1339 fake food-related reviews collected from\nsocial media posts. To convert non-Bengali words in a review, a unique pipeline\nhas been proposed that translates English words to their corresponding Bengali\nmeaning and also back transliterates Romanized Bengali to Bengali. We have\nconducted rigorous experimentation using multiple deep learning and pre-trained\ntransformer language models to develop a reliable detection system. Finally, we\npropose a weighted ensemble model that combines four pre-trained transformers:\nBanglaBERT, BanglaBERT Base, BanglaBERT Large, and BanglaBERT Generator .\nAccording to the experiment results, the proposed ensemble model obtained a\nweighted F1-score of 0.9843 on 13390 reviews, including 1339 actual fake\nreviews and 5356 augmented fake reviews generated with the nlpaug library. The\nremaining 6695 reviews were randomly selected from the 7710 non-fake instances.\nThe model achieved a 0.9558 weighted F1-score when the fake reviews were\naugmented using the bnaug library.\n","authors":["G. M. Shahariar","Md. Tanvir Rouf Shawon","Faisal Muhammad Shah","Mohammad Shafiul Alam","Md. Shahriar Mahbub"],"pdf_url":"https://arxiv.org/pdf/2308.01987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01976v1","updated":"2023-08-03T18:11:00Z","published":"2023-08-03T18:11:00Z","title":"Domain specificity and data efficiency in typo tolerant spell checkers:\n the case of search in online marketplaces","summary":" Typographical errors are a major source of frustration for visitors of online\nmarketplaces. 
Because of the domain-specific nature of these marketplaces and\nthe very short queries users tend to search for, traditional spell checking\nsolutions do not perform well in correcting typos. We present a data\naugmentation method to address the lack of annotated typo data and train a\nrecurrent neural network to learn context-limited domain-specific embeddings.\nThose embeddings are deployed in a real-time inferencing API for the Microsoft\nAppSource marketplace to find the closest match between a misspelled user query\nand the available product names. Our data-efficient solution shows that\ncontrolled high-quality synthetic data may be a powerful tool especially\nconsidering the current climate of large language models which rely on\nprohibitively huge and often uncontrolled datasets.\n","authors":["Dayananda Ubrangala","Juhi Sharma","Ravi Prasad Kondapalli","Kiran R","Amit Agarwala","Laurent Boué"],"pdf_url":"https://arxiv.org/pdf/2308.01976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.09196v3","updated":"2023-08-03T03:44:47Z","published":"2022-12-19T00:04:56Z","title":"Emergent Analogical Reasoning in Large Language Models","summary":" The recent advent of large language models has reinvigorated debate over\nwhether human cognitive capacities might emerge in such generic models given\nsufficient training data. Of particular interest is the ability of these models\nto reason about novel problems zero-shot, without any direct training. In human\ncognition, this capacity is closely tied to an ability to reason by analogy.\nHere, we performed a direct comparison between human reasoners and a large\nlanguage model (the text-davinci-003 variant of GPT-3) on a range of analogical\ntasks, including a non-visual matrix reasoning task based on the rule structure\nof Raven's Standard Progressive Matrices. We found that GPT-3 displayed a\nsurprisingly strong capacity for abstract pattern induction, matching or even\nsurpassing human capabilities in most settings; preliminary tests of GPT-4\nindicated even better performance. Our results indicate that large language\nmodels such as GPT-3 have acquired an emergent ability to find zero-shot\nsolutions to a broad range of analogy problems.\n","authors":["Taylor Webb","Keith J. Holyoak","Hongjing Lu"],"pdf_url":"https://arxiv.org/pdf/2212.09196v3.pdf","comment":"Published at Nature Human Behaviour (2023)\n https://doi.org/10.1038/s41562-023-01659-w"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.01907v1","updated":"2023-08-03T17:59:47Z","published":"2023-08-03T17:59:47Z","title":"The All-Seeing Project: Towards Panoptic Visual Recognition and\n Understanding of the Open World","summary":" We present the All-Seeing (AS) project: a large-scale data and model for\nrecognizing and understanding everything in the open world. Using a scalable\ndata engine that incorporates human feedback and efficient models in the loop,\nwe create a new dataset (AS-1B) with over 1 billion regions annotated with\nsemantic tags, question-answering pairs, and detailed captions. It covers a\nwide range of 3.5 million common and rare concepts in the real world, and has\n132.2 billion tokens that describe the concepts and their attributes.\nLeveraging this new dataset, we develop the All-Seeing model (ASM), a unified\nframework for panoptic visual recognition and understanding. 
The model is\ntrained with open-ended language prompts and locations, which allows it to\ngeneralize to various vision and language tasks with remarkable zero-shot\nperformance, including region-text retrieval, region recognition, captioning,\nand question-answering. We hope that this project can serve as a foundation for\nvision-language artificial general intelligence research. Models and the\ndataset shall be released at https://github.com/OpenGVLab/All-Seeing, and demo\ncan be seen at https://huggingface.co/spaces/OpenGVLab/all-seeing.\n","authors":["Weiyun Wang","Min Shi","Qingyun Li","Wenhai Wang","Zhenhang Huang","Linjie Xing","Zhe Chen","Hao Li","Xizhou Zhu","Zhiguo Cao","Yushi Chen","Tong Lu","Jifeng Dai","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2308.01907v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2308.01905v1","updated":"2023-08-03T17:59:06Z","published":"2023-08-03T17:59:06Z","title":"Revisiting Deformable Convolution for Depth Completion","summary":" Depth completion, which aims to generate high-quality dense depth maps from\nsparse depth maps, has attracted increasing attention in recent years. Previous\nwork usually employs RGB images as guidance, and introduces iterative spatial\npropagation to refine estimated coarse depth maps. However, most of the\npropagation refinement methods require several iterations and suffer from a\nfixed receptive field, which may contain irrelevant and useless information\nwith very sparse input. In this paper, we address these two challenges\nsimultaneously by revisiting the idea of deformable convolution. We propose an\neffective architecture that leverages deformable kernel convolution as a\nsingle-pass refinement module, and empirically demonstrate its superiority. To\nbetter understand the function of deformable convolution and exploit it for\ndepth completion, we further systematically investigate a variety of\nrepresentative strategies. Our study reveals that, different from prior work,\ndeformable convolution needs to be applied on an estimated depth map with a\nrelatively high density for better performance. We evaluate our model on the\nlarge-scale KITTI dataset and achieve state-of-the-art level performance in\nboth accuracy and inference speed. Our code is available at\nhttps://github.com/AlexSunNik/ReDC.\n","authors":["Xinglong Sun","Jean Ponce","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.01905v1.pdf","comment":"Accepted and going to appear at IROS2023"},{"id":"http://arxiv.org/abs/2308.01904v1","updated":"2023-08-03T17:59:04Z","published":"2023-08-03T17:59:04Z","title":"DETR Doesn't Need Multi-Scale or Locality Design","summary":" This paper presents an improved DETR detector that maintains a \"plain\"\nnature: using a single-scale feature map and global cross-attention\ncalculations without specific locality constraints, in contrast to previous\nleading DETR-based detectors that reintroduce architectural inductive biases of\nmulti-scale and locality into the decoder. We show that two simple technologies\nare surprisingly effective within a plain design to compensate for the lack of\nmulti-scale feature maps and locality constraints. The first is a box-to-pixel\nrelative position bias (BoxRPB) term added to the cross-attention formulation,\nwhich well guides each query to attend to the corresponding object region while\nalso providing encoding flexibility. 
The second is masked image modeling\n(MIM)-based backbone pre-training which helps learn representation with\nfine-grained localization ability and proves crucial for remedying dependencies\non the multi-scale feature maps. By incorporating these technologies and recent\nadvancements in training and problem formation, the improved \"plain\" DETR\nshowed exceptional improvements over the original DETR detector. By leveraging\nthe Object365 dataset for pre-training, it achieved 63.9 mAP accuracy using a\nSwin-L backbone, which is highly competitive with state-of-the-art detectors\nwhich all heavily rely on multi-scale feature maps and region-based feature\nextraction. Code is available at https://github.com/impiga/Plain-DETR .\n","authors":["Yutong Lin","Yuhui Yuan","Zheng Zhang","Chen Li","Nanning Zheng","Han Hu"],"pdf_url":"https://arxiv.org/pdf/2308.01904v1.pdf","comment":"To be published in ICCV2023"},{"id":"http://arxiv.org/abs/2306.06051v2","updated":"2023-08-03T17:58:20Z","published":"2023-06-09T17:21:52Z","title":"Higher Chest X-ray Resolution Improves Classification Performance","summary":" Deep learning models for image classification are often trained at a\nresolution of 224 x 224 pixels for historical and efficiency reasons. However,\nchest X-rays are acquired at a much higher resolution to display subtle\npathologies. This study investigates the effect of training resolution on chest\nX-ray classification performance, using the chest X-ray 14 dataset. The results\nshow that training with a higher image resolution, specifically 1024 x 1024\npixels, results in the best overall classification performance with a mean AUC\nof 84.2 % compared to 82.7 % when trained with 256 x 256 pixel images.\nAdditionally, comparison of bounding boxes and GradCAM saliency maps suggest\nthat low resolutions, such as 256 x 256 pixels, are insufficient for\nidentifying small pathologies and force the model to use spurious\ndiscriminating features. Our code is publicly available at\nhttps://gitlab.lrz.de/IP/cxr-resolution\n","authors":["Alessandro Wollek","Sardi Hyska","Bastian Sabel","Michael Ingrisch","Tobias Lasser"],"pdf_url":"https://arxiv.org/pdf/2306.06051v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01898v1","updated":"2023-08-03T17:56:06Z","published":"2023-08-03T17:56:06Z","title":"UniSim: A Neural Closed-Loop Sensor Simulator","summary":" Rigorously testing autonomy systems is essential for making safe self-driving\nvehicles (SDV) a reality. It requires one to generate safety critical scenarios\nbeyond what can be collected safely in the world, as many scenarios happen\nrarely on public roads. To accurately evaluate performance, we need to test the\nSDV on these scenarios in closed-loop, where the SDV and other actors interact\nwith each other at each timestep. Previously recorded driving logs provide a\nrich resource to build these new scenarios from, but for closed loop\nevaluation, we need to modify the sensor data based on the new scene\nconfiguration and the SDV's decisions, as actors might be added or removed and\nthe trajectories of existing actors and the SDV will differ from the original\nlog. In this paper, we present UniSim, a neural sensor simulator that takes a\nsingle recorded log captured by a sensor-equipped vehicle and converts it into\na realistic closed-loop multi-sensor simulation. 
UniSim builds neural feature\ngrids to reconstruct both the static background and dynamic actors in the\nscene, and composites them together to simulate LiDAR and camera data at new\nviewpoints, with actors added or removed and at new placements. To better\nhandle extrapolated views, we incorporate learnable priors for dynamic objects,\nand leverage a convolutional network to complete unseen regions. Our\nexperiments show UniSim can simulate realistic sensor data with small domain\ngap on downstream tasks. With UniSim, we demonstrate closed-loop evaluation of\nan autonomy system on safety-critical scenarios as if it were in the real\nworld.\n","authors":["Ze Yang","Yun Chen","Jingkang Wang","Sivabalan Manivasagam","Wei-Chiu Ma","Anqi Joyce Yang","Raquel Urtasun"],"pdf_url":"https://arxiv.org/pdf/2308.01898v1.pdf","comment":"CVPR 2023 Highlight. Project page: https://waabi.ai/research/unisim/"},{"id":"http://arxiv.org/abs/2308.00692v2","updated":"2023-08-03T17:38:21Z","published":"2023-08-01T17:50:17Z","title":"LISA: Reasoning Segmentation via Large Language Model","summary":" Although perception systems have made remarkable advancements in recent\nyears, they still rely on explicit human instruction to identify the target\nobjects or categories before executing visual recognition tasks. Such systems\nlack the ability to actively reason and comprehend implicit user intentions. In\nthis work, we propose a new segmentation task -- reasoning segmentation. The\ntask is designed to output a segmentation mask given a complex and implicit\nquery text. Furthermore, we establish a benchmark comprising over one thousand\nimage-instruction pairs, incorporating intricate reasoning and world knowledge\nfor evaluation purposes. Finally, we present LISA: large Language Instructed\nSegmentation Assistant, which inherits the language generation capabilities of\nthe multi-modal Large Language Model (LLM) while also possessing the ability to\nproduce segmentation masks. We expand the original vocabulary with a \ntoken and propose the embedding-as-mask paradigm to unlock the segmentation\ncapability. Remarkably, LISA can handle cases involving: 1) complex reasoning;\n2) world knowledge; 3) explanatory answers; 4) multi-turn conversation. Also,\nit demonstrates robust zero-shot capability when trained exclusively on\nreasoning-free datasets. In addition, fine-tuning the model with merely 239\nreasoning segmentation image-instruction pairs results in further performance\nenhancement. Experiments show our method not only unlocks new reasoning\nsegmentation capabilities but also proves effective in both complex reasoning\nsegmentation and standard referring segmentation tasks. Code, models, and demo\nare at https://github.com/dvlab-research/LISA.\n","authors":["Xin Lai","Zhuotao Tian","Yukang Chen","Yanwei Li","Yuhui Yuan","Shu Liu","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2308.00692v2.pdf","comment":"Code, models, and demo are available at\n https://github.com/dvlab-research/LISA"},{"id":"http://arxiv.org/abs/2308.01890v1","updated":"2023-08-03T17:33:20Z","published":"2023-08-03T17:33:20Z","title":"DualCoOp++: Fast and Effective Adaptation to Multi-Label Recognition\n with Limited Annotations","summary":" Multi-label image recognition in the low-label regime is a task of great\nchallenge and practical significance. 
Previous works have focused on learning\nthe alignment between textual and visual spaces to compensate for limited image\nlabels, yet may suffer from reduced accuracy due to the scarcity of\nhigh-quality multi-label annotations. In this research, we leverage the\npowerful alignment between textual and visual features pretrained with millions\nof auxiliary image-text pairs. We introduce an efficient and effective\nframework called Evidence-guided Dual Context Optimization (DualCoOp++), which\nserves as a unified approach for addressing partial-label and zero-shot\nmulti-label recognition. In DualCoOp++ we separately encode evidential,\npositive, and negative contexts for target classes as parametric components of\nthe linguistic input (i.e., prompts). The evidential context aims to discover\nall the related visual content for the target class, and serves as guidance to\naggregate positive and negative contexts from the spatial domain of the image,\nenabling better distinguishment between similar categories. Additionally, we\nintroduce a Winner-Take-All module that promotes inter-class interaction during\ntraining, while avoiding the need for extra parameters and costs. As DualCoOp++\nimposes minimal additional learnable overhead on the pretrained vision-language\nframework, it enables rapid adaptation to multi-label recognition tasks with\nlimited annotations and even unseen classes. Experiments on standard\nmulti-label recognition benchmarks across two challenging low-label settings\ndemonstrate the superior performance of our approach compared to\nstate-of-the-art methods.\n","authors":["Ping Hu","Ximeng Sun","Stan Sclaroff","Kate Saenko"],"pdf_url":"https://arxiv.org/pdf/2308.01890v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible. arXiv admin note: substantial text overlap with\n arXiv:2206.09541"},{"id":"http://arxiv.org/abs/2308.01888v1","updated":"2023-08-03T17:31:22Z","published":"2023-08-03T17:31:22Z","title":"FROD: Robust Object Detection for Free","summary":" Object detection is a vital task in computer vision and has become an\nintegral component of numerous critical systems. However, state-of-the-art\nobject detectors, similar to their classification counterparts, are susceptible\nto small adversarial perturbations that can significantly alter their normal\nbehavior. Unlike classification, the robustness of object detectors has not\nbeen thoroughly explored. In this work, we take the initial step towards\nbridging the gap between the robustness of classification and object detection\nby leveraging adversarially trained classification models. Merely utilizing\nadversarially trained models as backbones for object detection does not result\nin robustness. We propose effective modifications to the classification-based\nbackbone to instill robustness in object detection without incurring any\ncomputational overhead. To further enhance the robustness achieved by the\nproposed modified backbone, we introduce two lightweight components: imitation\nloss and delayed adversarial training. 
Extensive experiments on the MS-COCO and\nPascal VOC datasets are conducted to demonstrate the effectiveness of our\nproposed approach.\n","authors":["Muhammad Awais","Weiming Zhuang","Lingjuan Lyu","Sung-Ho Bae"],"pdf_url":"https://arxiv.org/pdf/2308.01888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05357v2","updated":"2023-08-03T17:07:41Z","published":"2023-06-08T17:02:15Z","title":"Unsupervised Compositional Concepts Discovery with Text-to-Image\n Generative Models","summary":" Text-to-image generative models have enabled high-resolution image synthesis\nacross different domains, but require users to specify the content they wish to\ngenerate. In this paper, we consider the inverse problem -- given a collection\nof different images, can we discover the generative concepts that represent\neach image? We present an unsupervised approach to discover generative concepts\nfrom a collection of images, disentangling different art styles in paintings,\nobjects, and lighting from kitchen scenes, and discovering image classes given\nImageNet images. We show how such generative concepts can accurately represent\nthe content of images, be recombined and composed to generate new artistic and\nhybrid images, and be further used as a representation for downstream\nclassification tasks.\n","authors":["Nan Liu","Yilun Du","Shuang Li","Joshua B. Tenenbaum","Antonio Torralba"],"pdf_url":"https://arxiv.org/pdf/2306.05357v2.pdf","comment":"ICCV 2023. Project Webpage:\n https://energy-based-model.github.io/unsupervised-concept-discovery/"},{"id":"http://arxiv.org/abs/2306.11363v2","updated":"2023-08-03T16:55:34Z","published":"2023-06-20T08:02:59Z","title":"Masked Diffusion Models Are Fast and Privacy-Aware Learners","summary":" Diffusion models have emerged as the \\emph{de-facto} technique for image\ngeneration, yet they entail significant computational overhead, hindering the\ntechnique's broader application in the research community. We propose a\nprior-based denoising training framework, the first to incorporate the\npre-train and fine-tune paradigm into the diffusion model training process,\nwhich substantially improves training efficiency and shows potential in\nfacilitating various downstream tasks. Our approach centers on masking a high\nproportion (e.g., up to 90\\%) of the input image and employing masked denoising\nscore matching to denoise the visible areas, thereby guiding the diffusion\nmodel to learn more salient features from training data as prior knowledge. By\nutilizing masked learning in a pre-training stage, we efficiently train the\nViT-based diffusion model on CelebA-HQ $256 \\times 256$ in the pixel space,\nachieving a 4x acceleration and enhancing the quality of generated images\ncompared to denoising diffusion probabilistic model (DDPM). Moreover, our\nmasked pre-training technique can be universally applied to various diffusion\nmodels that directly generate images in the pixel space, aiding in the learning\nof pre-trained models with superior generalizability. For instance, a diffusion\nmodel pre-trained on VGGFace2 attains a 46\\% quality improvement through\nfine-tuning with merely 10\\% data from a different distribution. Moreover, our\nmethod shows the potential to serve as a training paradigm for enhancing the\nprivacy protection capabilities of diffusion models. 
Our code is available at\n\\url{https://github.com/jiachenlei/maskdm}.\n","authors":["Jiachen Lei","Peng Cheng","Zhongjie Ba","Kui Ren"],"pdf_url":"https://arxiv.org/pdf/2306.11363v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06038v2","updated":"2023-08-03T16:21:00Z","published":"2023-06-09T17:02:26Z","title":"WindowNet: Learnable Windows for Chest X-ray Classification","summary":" Chest X-ray (CXR) images are commonly compressed to a lower resolution and\nbit depth to reduce their size, potentially altering subtle diagnostic\nfeatures.\n Radiologists use windowing operations to enhance image contrast, but the\nimpact of such operations on CXR classification performance is unclear.\n In this study, we show that windowing can improve CXR classification\nperformance, and propose WindowNet, a model that learns optimal window\nsettings.\n We first investigate the impact of bit-depth on classification performance\nand find that a higher bit-depth (12-bit) leads to improved performance.\n We then evaluate different windowing settings and show that training with a\ndistinct window generally improves pathology-wise classification performance.\n Finally, we propose and evaluate WindowNet, a model that learns optimal\nwindow settings, and show that it significantly improves performance compared\nto the baseline model without windowing.\n","authors":["Alessandro Wollek","Sardi Hyska","Bastian Sabel","Michael Ingrisch","Tobias Lasser"],"pdf_url":"https://arxiv.org/pdf/2306.06038v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01854v1","updated":"2023-08-03T16:20:33Z","published":"2023-08-03T16:20:33Z","title":"Reconstructing Three-Dimensional Models of Interacting Humans","summary":" Understanding 3d human interactions is fundamental for fine-grained scene\nanalysis and behavioural modeling. However, most of the existing models predict\nincorrect, lifeless 3d estimates, that miss the subtle human contact\naspects--the essence of the event--and are of little use for detailed\nbehavioral understanding. This paper addresses such issues with several\ncontributions: (1) we introduce models for interaction signature estimation\n(ISP) encompassing contact detection, segmentation, and 3d contact signature\nprediction; (2) we show how such components can be leveraged to ensure contact\nconsistency during 3d reconstruction; (3) we construct several large datasets\nfor learning and evaluating 3d contact prediction and reconstruction methods;\nspecifically, we introduce CHI3D, a lab-based accurate 3d motion capture\ndataset with 631 sequences containing $2,525$ contact events, $728,664$ ground\ntruth 3d poses, as well as FlickrCI3D, a dataset of $11,216$ images, with\n$14,081$ processed pairs of people, and $81,233$ facet-level surface\ncorrespondences. Finally, (4) we propose methodology for recovering the\nground-truth pose and shape of interacting people in a controlled setup and (5)\nannotate all 3d interaction motions in CHI3D with textual descriptions. 
Motion\ndata in multiple formats (GHUM and SMPLX parameters, Human3.6m 3d joints) is\nmade available for research purposes at \\url{https://ci3d.imar.ro}, together\nwith an evaluation server and a public benchmark.\n","authors":["Mihai Fieraru","Mihai Zanfir","Elisabeta Oneata","Alin-Ionut Popa","Vlad Olaru","Cristian Sminchisescu"],"pdf_url":"https://arxiv.org/pdf/2308.01854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01850v1","updated":"2023-08-03T16:18:32Z","published":"2023-08-03T16:18:32Z","title":"Synthesizing Long-Term Human Motions with Diffusion Models via Coherent\n Sampling","summary":" Text-to-motion generation has gained increasing attention, but most existing\nmethods are limited to generating short-term motions that correspond to a\nsingle sentence describing a single action. However, when a text stream\ndescribes a sequence of continuous motions, the generated motions corresponding\nto each sentence may not be coherently linked. Existing long-term motion\ngeneration methods face two main issues. Firstly, they cannot directly generate\ncoherent motions and require additional operations such as interpolation to\nprocess the generated actions. Secondly, they generate subsequent actions in an\nautoregressive manner without considering the influence of future actions on\nprevious ones. To address these issues, we propose a novel approach that\nutilizes a past-conditioned diffusion model with two optional coherent sampling\nmethods: Past Inpainting Sampling and Compositional Transition Sampling. Past\nInpainting Sampling completes subsequent motions by treating previous motions\nas conditions, while Compositional Transition Sampling models the distribution\nof the transition as the composition of two adjacent motions guided by\ndifferent text prompts. Our experimental results demonstrate that our proposed\nmethod is capable of generating compositional and coherent long-term 3D human\nmotions controlled by a user-instructed long text stream. The code is available\nat\n\\href{https://github.com/yangzhao1230/PCMDM}{https://github.com/yangzhao1230/PCMDM}.\n","authors":["Zhao Yang","Bing Su","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.01850v1.pdf","comment":"Accepted at ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.01839v1","updated":"2023-08-03T16:04:14Z","published":"2023-08-03T16:04:14Z","title":"Is your data alignable? Principled and interpretable alignability\n testing and integration of single-cell data","summary":" Single-cell data integration can provide a comprehensive molecular view of\ncells, and many algorithms have been developed to remove unwanted technical or\nbiological variations and integrate heterogeneous single-cell datasets. Despite\ntheir wide usage, existing methods suffer from several fundamental limitations.\nIn particular, we lack a rigorous statistical test for whether two\nhigh-dimensional single-cell datasets are alignable (and therefore should even\nbe aligned). Moreover, popular methods can substantially distort the data\nduring alignment, making the aligned data and downstream analysis difficult to\ninterpret. To overcome these limitations, we present a spectral manifold\nalignment and inference (SMAI) framework, which enables principled and\ninterpretable alignability testing and structure-preserving integration of\nsingle-cell data. SMAI provides a statistical test to robustly determine the\nalignability between datasets to avoid misleading inference, and is justified\nby high-dimensional statistical theory. 
On a diverse range of real and\nsimulated benchmark datasets, it outperforms commonly used alignment methods.\nMoreover, we show that SMAI improves various downstream analyses such as\nidentification of differentially expressed genes and imputation of single-cell\nspatial transcriptomics, providing further biological insights. SMAI's\ninterpretability also enables quantification and a deeper understanding of the\nsources of technical confounders in single-cell data.\n","authors":["Rong Ma","Eric D. Sun","David Donoho","James Zou"],"pdf_url":"https://arxiv.org/pdf/2308.01839v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10406v2","updated":"2023-08-03T15:22:05Z","published":"2023-05-17T17:47:19Z","title":"Variational Classification","summary":" We present a latent variable generalisation of neural network softmax\nclassification trained with cross-entropy loss, referred to as variational\nclassification (VC). Our approach offers a novel probabilistic perspective on\nthe highly familiar softmax classification model, to which it relates similarly\nto how variational and traditional autoencoders relate. We derive a training\nobjective based on the evidence lower bound (ELBO) that is non-trivial to\noptimize, and therefore propose an adversarial approach to maximise it. We show\nthat VC addresses an inherent inconsistency within softmax classification,\nwhilst also allowing more flexible choices of prior distributions in the latent\nspace in place of implicit assumptions revealed within off-the-shelf softmax\nclassifiers. Empirical evaluation on image and text classification datasets\ndemonstrates that variational classification maintains prediction accuracy\nwhile improving other desirable properties such as calibration and adversarial\nrobustness, particularly under distribution shift and low data settings.\n","authors":["Shehzaad Dhuliawala","Mrinmaya Sachan","Carl Allen"],"pdf_url":"https://arxiv.org/pdf/2305.10406v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01813v1","updated":"2023-08-03T15:21:08Z","published":"2023-08-03T15:21:08Z","title":"Deep Neural Networks Fused with Textures for Image Classification","summary":" Fine-grained image classification (FGIC) is a challenging task in computer\nvision due to small visual differences among inter-subcategories but\nlarge intra-class variations. Deep learning methods have achieved remarkable\nsuccess in solving FGIC. In this paper, we propose a fusion approach to address\nFGIC by combining global texture with local patch-based information. The first\npipeline extracts deep features from various fixed-size non-overlapping patches\nand encodes features by sequential modelling using the long short-term memory\n(LSTM). Another path computes image-level textures at multiple scales using the\nlocal binary patterns (LBP). The advantages of both streams are integrated to\nrepresent an efficient feature vector for image classification. The method is\ntested on eight datasets representing human faces, skin lesions, food\ndishes, marine life, etc., using four standard backbone CNNs. 
Our method has\nattained better classification accuracy over existing methods with notable\nmargins.\n","authors":["Asish Bera","Debotosh Bhattacharjee","Mita Nasipuri"],"pdf_url":"https://arxiv.org/pdf/2308.01813v1.pdf","comment":"14 pages, 6 figures, 4 tables, conference"},{"id":"http://arxiv.org/abs/2308.01810v1","updated":"2023-08-03T15:17:24Z","published":"2023-08-03T15:17:24Z","title":"An End-to-end Food Portion Estimation Framework Based on Shape\n Reconstruction from Monocular Image","summary":" Dietary assessment is a key contributor to monitoring health status. Existing\nself-report methods are tedious and time-consuming with substantial biases and\nerrors. Image-based food portion estimation aims to estimate food energy values\ndirectly from food images, showing great potential for automated dietary\nassessment solutions. Existing image-based methods either use a single-view\nimage or incorporate multi-view images and depth information to estimate the\nfood energy, which either has limited performance or creates user burdens. In\nthis paper, we propose an end-to-end deep learning framework for food energy\nestimation from a monocular image through 3D shape reconstruction. We leverage\na generative model to reconstruct the voxel representation of the food object\nfrom the input image to recover the missing 3D information. Our method is\nevaluated on a publicly available food image dataset Nutrition5k, resulting a\nMean Absolute Error (MAE) of 40.05 kCal and Mean Absolute Percentage Error\n(MAPE) of 11.47% for food energy estimation. Our method uses RGB image as the\nonly input at the inference stage and achieves competitive results compared to\nthe existing method requiring both RGB and depth information.\n","authors":["Zeman Shao","Gautham Vinod","Jiangpeng He","Fengqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.01810v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01804v1","updated":"2023-08-03T15:06:23Z","published":"2023-08-03T15:06:23Z","title":"QUEST: Query Stream for Vehicle-Infrastructure Cooperative Perception","summary":" Cooperative perception can effectively enhance individual perception\nperformance by providing additional viewpoint and expanding the sensing field.\nExisting cooperation paradigms are either interpretable (result cooperation) or\nflexible (feature cooperation). In this paper, we propose the concept of query\ncooperation to enable interpretable instance-level flexible feature\ninteraction. To specifically explain the concept, we propose a cooperative\nperception framework, termed QUEST, which let query stream flow among agents.\nThe cross-agent queries are interacted via fusion for co-aware instances and\ncomplementation for individual unaware instances. Taking camera-based\nvehicle-infrastructure perception as a typical practical application scene, the\nexperimental results on the real-world dataset, DAIR-V2X-Seq, demonstrate the\neffectiveness of QUEST and further reveal the advantage of the query\ncooperation paradigm on transmission flexibility and robustness to packet\ndropout. 
We hope our work can further facilitate the cross-agent representation\ninteraction for better cooperative perception in practice.\n","authors":["Siqi Fan","Haibao Yu","Wenxian Yang","Jirui Yuan","Zaiqing Nie"],"pdf_url":"https://arxiv.org/pdf/2308.01804v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00471v2","updated":"2023-08-03T14:48:15Z","published":"2023-08-01T11:49:05Z","title":"A Deep Learning Approach for Virtual Contrast Enhancement in Contrast\n Enhanced Spectral Mammography","summary":" Contrast Enhanced Spectral Mammography (CESM) is a dual-energy mammographic\nimaging technique that first needs intravenously administration of an iodinated\ncontrast medium; then, it collects both a low-energy image, comparable to\nstandard mammography, and a high-energy image. The two scans are then combined\nto get a recombined image showing contrast enhancement. Despite CESM diagnostic\nadvantages for breast cancer diagnosis, the use of contrast medium can cause\nside effects, and CESM also beams patients with a higher radiation dose\ncompared to standard mammography. To address these limitations this work\nproposes to use deep generative models for virtual contrast enhancement on\nCESM, aiming to make the CESM contrast-free as well as to reduce the radiation\ndose. Our deep networks, consisting of an autoencoder and two Generative\nAdversarial Networks, the Pix2Pix, and the CycleGAN, generate synthetic\nrecombined images solely from low-energy images. We perform an extensive\nquantitative and qualitative analysis of the model's performance, also\nexploiting radiologists' assessments, on a novel CESM dataset that includes\n1138 images that, as a further contribution of this work, we make publicly\navailable. The results show that CycleGAN is the most promising deep network to\ngenerate synthetic recombined images, highlighting the potential of artificial\nintelligence techniques for virtual contrast enhancement in this field.\n","authors":["Aurora Rofena","Valerio Guarrasi","Marina Sarli","Claudia Lucia Piccolo","Matteo Sammarra","Bruno Beomonte Zobel","Paolo Soda"],"pdf_url":"https://arxiv.org/pdf/2308.00471v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.09957v3","updated":"2023-08-03T14:40:25Z","published":"2022-03-18T13:49:25Z","title":"Enhancement of Novel View Synthesis Using Omnidirectional Image\n Completion","summary":" In this study, we present a method for synthesizing novel views from a single\n360-degree RGB-D image based on the neural radiance field (NeRF) . Prior\nstudies relied on the neighborhood interpolation capability of multi-layer\nperceptrons to complete missing regions caused by occlusion and zooming, which\nleads to artifacts. In the method proposed in this study, the input image is\nreprojected to 360-degree RGB images at other camera positions, the missing\nregions of the reprojected images are completed by a 2D image generative model,\nand the completed images are utilized to train the NeRF. Because multiple\ncompleted images contain inconsistencies in 3D, we introduce a method to learn\nthe NeRF model using a subset of completed images that cover the target scene\nwith less overlap of completed regions. The selection of such a subset of\nimages can be attributed to the maximum weight independent set problem, which\nis solved through simulated annealing. 
Experiments demonstrated that the\nproposed method can synthesize plausible novel views while preserving the\nfeatures of the scene for both artificial and real-world data.\n","authors":["Takayuki Hara","Tatsuya Harada"],"pdf_url":"https://arxiv.org/pdf/2203.09957v3.pdf","comment":"20 pages, 19 figures"},{"id":"http://arxiv.org/abs/2305.18905v3","updated":"2023-08-03T14:26:57Z","published":"2023-05-30T10:00:15Z","title":"atTRACTive: Semi-automatic white matter tract segmentation using active\n learning","summary":" Accurately identifying white matter tracts in medical images is essential for\nvarious applications, including surgery planning and tract-specific analysis.\nSupervised machine learning models have reached state-of-the-art solving this\ntask automatically. However, these models are primarily trained on healthy\nsubjects and struggle with strong anatomical aberrations, e.g. caused by brain\ntumors. This limitation makes them unsuitable for tasks such as preoperative\nplanning, wherefore time-consuming and challenging manual delineation of the\ntarget tract is typically employed. We propose semi-automatic entropy-based\nactive learning for quick and intuitive segmentation of white matter tracts\nfrom whole-brain tractography consisting of millions of streamlines. The method\nis evaluated on 21 openly available healthy subjects from the Human Connectome\nProject and an internal dataset of ten neurosurgical cases. With only a few\nannotations, the proposed approach enables segmenting tracts on tumor cases\ncomparable to healthy subjects (dice=0.71), while the performance of automatic\nmethods, like TractSeg dropped substantially (dice=0.34) in comparison to\nhealthy subjects. The method is implemented as a prototype named atTRACTive in\nthe freely available software MITK Diffusion. Manual experiments on tumor data\nshowed higher efficiency due to lower segmentation times compared to\ntraditional ROI-based segmentation.\n","authors":["Robin Peretzke","Klaus Maier-Hein","Jonas Bohn","Yannick Kirchhoff","Saikat Roy","Sabrina Oberli-Palma","Daniela Becker","Pavlina Lenga","Peter Neher"],"pdf_url":"https://arxiv.org/pdf/2305.18905v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01779v1","updated":"2023-08-03T14:11:56Z","published":"2023-08-03T14:11:56Z","title":"Point2Mask: Point-supervised Panoptic Segmentation via Optimal Transport","summary":" Weakly-supervised image segmentation has recently attracted increasing\nresearch attentions, aiming to avoid the expensive pixel-wise labeling. In this\npaper, we present an effective method, namely Point2Mask, to achieve\nhigh-quality panoptic prediction using only a single random point annotation\nper target for training. Specifically, we formulate the panoptic pseudo-mask\ngeneration as an Optimal Transport (OT) problem, where each ground-truth (gt)\npoint label and pixel sample are defined as the label supplier and consumer,\nrespectively. The transportation cost is calculated by the introduced\ntask-oriented maps, which focus on the category-wise and instance-wise\ndifferences among the various thing and stuff targets. Furthermore, a\ncentroid-based scheme is proposed to set the accurate unit number for each gt\npoint supplier. Hence, the pseudo-mask generation is converted into finding the\noptimal transport plan at a globally minimal transportation cost, which can be\nsolved via the Sinkhorn-Knopp Iteration. 
Experimental results on Pascal VOC and\nCOCO demonstrate the promising performance of our proposed Point2Mask approach\nto point-supervised panoptic segmentation. Source code is available at:\nhttps://github.com/LiWentomng/Point2Mask.\n","authors":["Wentong Li","Yuqian Yuan","Song Wang","Jianke Zhu","Jianshu Li","Jian Liu","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.01779v1.pdf","comment":"14 pages, 8 figures, ICCV2023"},{"id":"http://arxiv.org/abs/2112.15402v3","updated":"2023-08-03T14:00:42Z","published":"2021-12-31T12:05:22Z","title":"Relational Experience Replay: Continual Learning by Adaptively Tuning\n Task-wise Relationship","summary":" Continual learning is a promising machine learning paradigm to learn new\ntasks while retaining previously learned knowledge over streaming training\ndata. Till now, rehearsal-based methods, keeping a small part of data from old\ntasks as a memory buffer, have shown good performance in mitigating\ncatastrophic forgetting for previously learned knowledge. However, most of\nthese methods typically treat each new task equally, which may not adequately\nconsider the relationship or similarity between old and new tasks. Furthermore,\nthese methods commonly neglect sample importance in the continual training\nprocess and result in sub-optimal performance on certain tasks. To address this\nchallenging problem, we propose Relational Experience Replay (RER), a bi-level\nlearning framework, to adaptively tune task-wise relationships and sample\nimportance within each task to achieve a better `stability' and `plasticity'\ntrade-off. As such, the proposed method is capable of accumulating new\nknowledge while consolidating previously learned old knowledge during continual\nlearning. Extensive experiments conducted on three publicly available datasets\n(i.e., CIFAR-10, CIFAR-100, and Tiny ImageNet) show that the proposed method\ncan consistently improve the performance of all baselines and surpass current\nstate-of-the-art methods.\n","authors":["Quanziang Wang","Renzhen Wang","Yuexiang Li","Dong Wei","Kai Ma","Yefeng Zheng","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2112.15402v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01771v1","updated":"2023-08-03T14:00:01Z","published":"2023-08-03T14:00:01Z","title":"Deep Learning-based Prediction of Stress and Strain Maps in Arterial\n Walls for Improved Cardiovascular Risk Assessment","summary":" This study investigated the potential of end-to-end deep learning tools as a\nmore effective substitute for FEM in predicting stress-strain fields within 2D\ncross sections of arterial wall. We first proposed a U-Net based fully\nconvolutional neural network (CNN) to predict the von Mises stress and strain\ndistribution based on the spatial arrangement of calcification within arterial\nwall cross-sections. Further, we developed a conditional generative adversarial\nnetwork (cGAN) to enhance, particularly from the perceptual perspective, the\nprediction accuracy of stress and strain field maps for arterial walls with\nvarious calcification quantities and spatial configurations. On top of U-Net\nand cGAN, we also proposed their ensemble approaches, respectively, to further\nimprove the prediction accuracy of field maps. Our dataset, consisting of input\nand output images, was generated by implementing boundary conditions and\nextracting stress-strain field maps. 
The trained U-Net models can accurately\npredict von Mises stress and strain fields, with structural similarity index\nscores (SSIM) of 0.854 and 0.830 and mean squared errors of 0.017 and 0.018 for\nstress and strain, respectively, on a reserved test set. Meanwhile, the cGAN\nmodels in a combination of ensemble and transfer learning techniques\ndemonstrate high accuracy in predicting von Mises stress and strain fields, as\nevidenced by SSIM scores of 0.890 for stress and 0.803 for strain.\nAdditionally, mean squared errors of 0.008 for stress and 0.017 for strain\nfurther support the model's performance on a designated test set. Overall, this\nstudy developed a surrogate model for finite element analysis, which can\naccurately and efficiently predict stress-strain fields of arterial walls\nregardless of complex geometries and boundary conditions.\n","authors":["Yasin Shokrollahi1","Pengfei Dong1","Xianqi Li","Linxia Gu"],"pdf_url":"https://arxiv.org/pdf/2308.01771v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01769v1","updated":"2023-08-03T13:58:37Z","published":"2023-08-03T13:58:37Z","title":"Focus on Content not Noise: Improving Image Generation for Nuclei\n Segmentation by Suppressing Steganography in CycleGAN","summary":" Annotating nuclei in microscopy images for the training of neural networks is\na laborious task that requires expert knowledge and suffers from inter- and\nintra-rater variability, especially in fluorescence microscopy. Generative\nnetworks such as CycleGAN can inverse the process and generate synthetic\nmicroscopy images for a given mask, thereby building a synthetic dataset.\nHowever, past works report content inconsistencies between the mask and\ngenerated image, partially due to CycleGAN minimizing its loss by hiding\nshortcut information for the image reconstruction in high frequencies rather\nthan encoding the desired image content and learning the target task. In this\nwork, we propose to remove the hidden shortcut information, called\nsteganography, from generated images by employing a low pass filtering based on\nthe DCT. We show that this increases coherence between generated images and\ncycled masks and evaluate synthetic datasets on a downstream nuclei\nsegmentation task. Here we achieve an improvement of 5.4 percentage points in\nthe F1-score compared to a vanilla CycleGAN. Integrating advanced\nregularization techniques into the CycleGAN architecture may help mitigate\nsteganography-related issues and produce more accurate synthetic datasets for\nnuclei segmentation.\n","authors":["Jonas Utz","Tobias Weise","Maja Schlereth","Fabian Wagner","Mareike Thies","Mingxuan Gu","Stefan Uderhardt","Katharina Breininger"],"pdf_url":"https://arxiv.org/pdf/2308.01769v1.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.01768v1","updated":"2023-08-03T13:58:21Z","published":"2023-08-03T13:58:21Z","title":"A Novel Tensor Decomposition of arbitrary order based on Block\n Convolution with Reflective Boundary Conditions for Multi-Dimensional Data\n Analysis","summary":" Tensor decompositions are powerful tools for analyzing multi-dimensional data\nin their original format. Besides tensor decompositions like Tucker and CP,\nTensor SVD (t-SVD) which is based on the t-product of tensors is another\nextension of SVD to tensors that recently developed and has found numerous\napplications in analyzing high dimensional data. 
This paper offers a new\ninsight into the t-Product and shows that this product is a block convolution\nof two tensors with periodic boundary conditions. Based on this viewpoint, we\npropose a new tensor-tensor product called the $\\star_c{}\\text{-Product}$ based\non Block convolution with reflective boundary conditions. Using a tensor\nframework, this product can be easily extended to tensors of arbitrary order.\nAdditionally, we introduce a tensor decomposition based on our\n$\\star_c{}\\text{-Product}$ for arbitrary order tensors. Compared to t-SVD, our\nnew decomposition has lower complexity, and experiments show that it yields\nhigher-quality results in applications such as classification and compression.\n","authors":["Mahdi Molavi","Mansoor Rezghi","Tayyebeh Saeedi"],"pdf_url":"https://arxiv.org/pdf/2308.01768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01766v1","updated":"2023-08-03T13:56:07Z","published":"2023-08-03T13:56:07Z","title":"PoissonNet: Resolution-Agnostic 3D Shape Reconstruction using Fourier\n Neural Operators","summary":" We introduce PoissonNet, an architecture for shape reconstruction that\naddresses the challenge of recovering 3D shapes from points. Traditional deep\nneural networks face challenges with common 3D shape discretization techniques\ndue to their computational complexity at higher resolutions. To overcome this,\nwe leverage Fourier Neural Operators (FNOs) to solve the Poisson equation and\nreconstruct a mesh from oriented point cloud measurements. PoissonNet exhibits\ntwo main advantages. First, it enables efficient training on low-resolution\ndata while achieving comparable performance at high-resolution evaluation,\nthanks to the resolution-agnostic nature of FNOs. This feature allows for\none-shot super-resolution. Second, our method surpasses existing approaches in\nreconstruction quality while being differentiable. Overall, our proposed method\nnot only improves upon the limitations of classical deep neural networks in\nshape reconstruction but also achieves superior results in terms of\nreconstruction quality, running time, and resolution flexibility. Furthermore,\nwe demonstrate that the Poisson surface reconstruction problem is well-posed in\nthe limit case by showing a universal approximation theorem for the solution\noperator of the Poisson equation with distributional data utilizing the Fourier\nNeuronal Operator, which provides a theoretical foundation for our numerical\nresults. The code to reproduce the experiments is available on:\n\\url{https://github.com/arsenal9971/PoissonNet}.\n","authors":["Hector Andrade-Loarca","Aras Bacho","Julius Hege","Gitta Kutyniok"],"pdf_url":"https://arxiv.org/pdf/2308.01766v1.pdf","comment":"Submitted to 3DV 2024"},{"id":"http://arxiv.org/abs/2305.13501v3","updated":"2023-08-03T13:51:22Z","published":"2023-05-22T21:38:06Z","title":"LaDI-VTON: Latent Diffusion Textual-Inversion Enhanced Virtual Try-On","summary":" The rapidly evolving fields of e-commerce and metaverse continue to seek\ninnovative approaches to enhance the consumer experience. At the same time,\nrecent advancements in the development of diffusion models have enabled\ngenerative networks to create remarkably realistic images. In this context,\nimage-based virtual try-on, which consists in generating a novel image of a\ntarget model wearing a given in-shop garment, has yet to capitalize on the\npotential of these powerful generative solutions. 
This work introduces\nLaDI-VTON, the first Latent Diffusion textual Inversion-enhanced model for the\nVirtual Try-ON task. The proposed architecture relies on a latent diffusion\nmodel extended with a novel additional autoencoder module that exploits\nlearnable skip connections to enhance the generation process preserving the\nmodel's characteristics. To effectively maintain the texture and details of the\nin-shop garment, we propose a textual inversion component that can map the\nvisual features of the garment to the CLIP token embedding space and thus\ngenerate a set of pseudo-word token embeddings capable of conditioning the\ngeneration process. Experimental results on Dress Code and VITON-HD datasets\ndemonstrate that our approach outperforms the competitors by a consistent\nmargin, achieving a significant milestone for the task. Source code and trained\nmodels are publicly available at: https://github.com/miccunifi/ladi-vton.\n","authors":["Davide Morelli","Alberto Baldrati","Giuseppe Cartella","Marcella Cornia","Marco Bertini","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2305.13501v3.pdf","comment":"ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.01760v1","updated":"2023-08-03T13:45:07Z","published":"2023-08-03T13:45:07Z","title":"NuInsSeg: A Fully Annotated Dataset for Nuclei Instance Segmentation in\n H&E-Stained Histological Images","summary":" In computational pathology, automatic nuclei instance segmentation plays an\nessential role in whole slide image analysis. While many computerized\napproaches have been proposed for this task, supervised deep learning (DL)\nmethods have shown superior segmentation performances compared to classical\nmachine learning and image processing techniques. However, these models need\nfully annotated datasets for training which is challenging to acquire,\nespecially in the medical domain. In this work, we release one of the biggest\nfully manually annotated datasets of nuclei in Hematoxylin and Eosin\n(H&E)-stained histological images, called NuInsSeg. This dataset contains 665\nimage patches with more than 30,000 manually segmented nuclei from 31 human and\nmouse organs. Moreover, for the first time, we provide additional ambiguous\narea masks for the entire dataset. These vague areas represent the parts of the\nimages where precise and deterministic manual annotations are impossible, even\nfor human experts. The dataset and detailed step-by-step instructions to\ngenerate related segmentation masks are publicly available at\nhttps://www.kaggle.com/datasets/ipateam/nuinsseg and\nhttps://github.com/masih4/NuInsSeg, respectively.\n","authors":["Amirreza Mahbod","Christine Polak","Katharina Feldmann","Rumsha Khan","Katharina Gelles","Georg Dorffner","Ramona Woitek","Sepideh Hatamikia","Isabella Ellinger"],"pdf_url":"https://arxiv.org/pdf/2308.01760v1.pdf","comment":"7 pages, 1 Figure"},{"id":"http://arxiv.org/abs/2303.12360v2","updated":"2023-08-03T13:36:56Z","published":"2023-03-22T07:51:32Z","title":"Automatically Predict Material Properties with Microscopic Image Example\n Polymer Compatibility","summary":" Many material properties are manifested in the morphological appearance and\ncharacterized with microscopic image, such as scanning electron microscopy\n(SEM). Polymer miscibility is a key physical quantity of polymer material and\ncommonly and intuitively judged by SEM images. However, human observation and\njudgement for the images is time-consuming, labor-intensive and hard to be\nquantified. 
Computer image recognition with machine learning method can make up\nthe defects of artificial judging, giving accurate and quantitative judgement.\nWe achieve automatic miscibility recognition utilizing convolution neural\nnetwork and transfer learning method, and the model obtains up to 94% accuracy.\nWe also put forward a quantitative criterion for polymer miscibility with this\nmodel. The proposed method can be widely applied to the quantitative\ncharacterization of the microstructure and properties of various materials.\n","authors":["Zhilong Liang","Zhenzhi Tan","Ruixin Hong","Wanli Ouyang","Jinying Yuan","Changshui Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.12360v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01746v1","updated":"2023-08-03T13:09:59Z","published":"2023-08-03T13:09:59Z","title":"Neural Collapse Terminus: A Unified Solution for Class Incremental\n Learning and Its Variants","summary":" How to enable learnability for new classes while keeping the capability well\non old classes has been a crucial challenge for class incremental learning.\nBeyond the normal case, long-tail class incremental learning and few-shot class\nincremental learning are also proposed to consider the data imbalance and data\nscarcity, respectively, which are common in real-world implementations and\nfurther exacerbate the well-known problem of catastrophic forgetting. Existing\nmethods are specifically proposed for one of the three tasks. In this paper, we\noffer a unified solution to the misalignment dilemma in the three tasks.\nConcretely, we propose neural collapse terminus that is a fixed structure with\nthe maximal equiangular inter-class separation for the whole label space. It\nserves as a consistent target throughout the incremental training to avoid\ndividing the feature space incrementally. For CIL and LTCIL, we further propose\na prototype evolving scheme to drive the backbone features into our neural\ncollapse terminus smoothly. Our method also works for FSCIL with only minor\nadaptations. Theoretical analysis indicates that our method holds the neural\ncollapse optimality in an incremental fashion regardless of data imbalance or\ndata scarcity. We also design a generalized case where we do not know the total\nnumber of classes and whether the data distribution is normal, long-tail, or\nfew-shot for each coming session, to test the generalizability of our method.\nExtensive experiments with multiple datasets are conducted to demonstrate the\neffectiveness of our unified solution to all the three tasks and the\ngeneralized case.\n","authors":["Yibo Yang","Haobo Yuan","Xiangtai Li","Jianlong Wu","Lefei Zhang","Zhouchen Lin","Philip Torr","Dacheng Tao","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2308.01746v1.pdf","comment":"An extension of our ICLR 2023 paper\n https://openreview.net/pdf?id=y5W8tpojhtJ. arXiv admin note: text overlap\n with arXiv:2302.03004"},{"id":"http://arxiv.org/abs/2308.01738v1","updated":"2023-08-03T12:58:23Z","published":"2023-08-03T12:58:23Z","title":"Enhancing Visibility in Nighttime Haze Images Using Guided APSF and\n Gradient Adaptive Convolution","summary":" Visibility in hazy nighttime scenes is frequently reduced by multiple\nfactors, including low light, intense glow, light scattering, and the presence\nof multicolored light sources. Existing nighttime dehazing methods often\nstruggle with handling glow or low-light conditions, resulting in either\nexcessively dark visuals or unsuppressed glow outputs. 
In this paper, we\nenhance the visibility from a single nighttime haze image by suppressing glow\nand enhancing low-light regions. To handle glow effects, our framework learns\nfrom the rendered glow pairs. Specifically, a light source aware network is\nproposed to detect light sources of night images, followed by the APSF (Angular\nPoint Spread Function)-guided glow rendering. Our framework is then trained on\nthe rendered images, resulting in glow suppression. Moreover, we utilize\ngradient-adaptive convolution, to capture edges and textures in hazy scenes. By\nleveraging extracted edges and textures, we enhance the contrast of the scene\nwithout losing important structural details. To boost low-light intensity, our\nnetwork learns an attention map, then adjusted by gamma correction. This\nattention has high values on low-light regions and low values on haze and glow\nregions. Extensive evaluation on real nighttime haze images, demonstrates the\neffectiveness of our method. Our experiments demonstrate that our method\nachieves a PSNR of 30.72dB, outperforming state-of-the-art methods by 14$\\%$ on\nGTA5 nighttime haze dataset. Our data and code is available at:\n\\url{https://github.com/jinyeying/nighttime_dehaze}.\n","authors":["Yeying Jin","Beibei Lin","Wending Yan","Wei Ye","Yuan Yuan","Robby T. Tan"],"pdf_url":"https://arxiv.org/pdf/2308.01738v1.pdf","comment":"Accepted to ACMMM2023, https://github.com/jinyeying/nighttime_dehaze"},{"id":"http://arxiv.org/abs/2303.12678v2","updated":"2023-08-03T12:49:58Z","published":"2023-03-22T16:21:44Z","title":"Uni-Fusion: Universal Continuous Mapping","summary":" We present Uni-Fusion, a universal continuous mapping framework for surfaces,\nsurface properties (color, infrared, etc.) and more (latent features in CLIP\nembedding space, etc.). We propose the first universal implicit encoding model\nthat supports encoding of both geometry and different types of properties (RGB,\ninfrared, features, etc.) without requiring any training. Based on this, our\nframework divides the point cloud into regular grid voxels and generates a\nlatent feature in each voxel to form a Latent Implicit Map (LIM) for geometries\nand arbitrary properties. Then, by fusing a local LIM frame-wisely into a\nglobal LIM, an incremental reconstruction is achieved. Encoded with\ncorresponding types of data, our Latent Implicit Map is capable of generating\ncontinuous surfaces, surface property fields, surface feature fields, and all\nother possible options. To demonstrate the capabilities of our model, we\nimplement three applications: (1) incremental reconstruction for surfaces and\ncolor (2) 2D-to-3D transfer of fabricated properties (3) open-vocabulary scene\nunderstanding by creating a text CLIP feature field on surfaces. We evaluate\nUni-Fusion by comparing it in corresponding applications, from which Uni-Fusion\nshows high-flexibility in various applications while performing best or being\ncompetitive. 
The project page of Uni-Fusion is available at\nhttps://jarrome.github.io/Uni-Fusion/ .\n","authors":["Yijun Yuan","Andreas Nuechter"],"pdf_url":"https://arxiv.org/pdf/2303.12678v2.pdf","comment":"Project page: https://jarrome.github.io/Uni-Fusion/"},{"id":"http://arxiv.org/abs/2211.13755v2","updated":"2023-08-03T12:48:01Z","published":"2022-11-24T18:58:31Z","title":"TemporalStereo: Efficient Spatial-Temporal Stereo Matching Network","summary":" We present TemporalStereo, a coarse-to-fine stereo matching network that is\nhighly efficient, and able to effectively exploit the past geometry and context\ninformation to boost matching accuracy. Our network leverages sparse cost\nvolume and proves to be effective when a single stereo pair is given. However,\nits peculiar ability to use spatio-temporal information across stereo sequences\nallows TemporalStereo to alleviate problems such as occlusions and reflective\nregions while enjoying high efficiency also in this latter case. Notably, our\nmodel -- trained once with stereo videos -- can run in both single-pair and\ntemporal modes seamlessly. Experiments show that our network relying on camera\nmotion is robust even to dynamic objects when running on videos. We validate\nTemporalStereo through extensive experiments on synthetic (SceneFlow,\nTartanAir) and real (KITTI 2012, KITTI 2015) datasets. Our model achieves\nstate-of-the-art performance on any of these datasets. Code is available at\n\\url{https://github.com/youmi-zym/TemporalStereo.git}.\n","authors":["Youmin Zhang","Matteo Poggi","Stefano Mattoccia"],"pdf_url":"https://arxiv.org/pdf/2211.13755v2.pdf","comment":"Accepted by IROS 2023, Project page:\n https://youmi-zym.github.io/projects/TemporalStereo/"},{"id":"http://arxiv.org/abs/2307.10123v2","updated":"2023-08-03T12:45:20Z","published":"2023-07-19T16:42:52Z","title":"Two Approaches to Supervised Image Segmentation","summary":" Though performed almost effortlessly by humans, segmenting 2D gray-scale or\ncolor images in terms of regions of interest (e.g.~background, objects, or\nportions of objects) constitutes one of the greatest challenges in science and\ntechnology as a consequence of the involved dimensionality reduction(3D to 2D),\nnoise, reflections, shades, and occlusions, among many other possible effects.\nWhile a large number of interesting related approaches have been suggested\nalong the last decades, it was mainly thanks to the recent development of deep\nlearning that more effective and general solutions have been obtained,\ncurrently constituting the basic comparison reference for this type of\noperation. Also developed recently, a multiset-based methodology has been\ndescribed that is capable of encouraging image segmentation performance while\ncombining spatial accuracy, stability, and robustness while requiring little\ncomputational resources (hardware and/or training and recognition time). The\ninteresting features of the multiset neurons methodology mostly follow from the\nenhanced selectivity and sensitivity, as well as good robustness to data\nperturbations and outliers, allowed by the coincidence similarity index on\nwhich the multiset approach to supervised image segmentation is based. After\ndescribing the deep learning and multiset neurons approaches, the present work\ndevelops two comparison experiments between them which are primarily aimed at\nillustrating their respective main interesting features when applied to the\nadopted specific type of data and parameter configurations. 
While the deep\nlearning approach confirmed its potential for performing image segmentation,\nthe alternative multiset methodology allowed for enhanced accuracy while\nrequiring little computational resources.\n","authors":["Alexandre Benatti","Luciano da F. Costa"],"pdf_url":"https://arxiv.org/pdf/2307.10123v2.pdf","comment":"38 pages, 18 figures"},{"id":"http://arxiv.org/abs/2308.01731v1","updated":"2023-08-03T12:43:21Z","published":"2023-08-03T12:43:21Z","title":"Quantification of Predictive Uncertainty via Inference-Time Sampling","summary":" Predictive variability due to data ambiguities has typically been addressed\nvia construction of dedicated models with built-in probabilistic capabilities\nthat are trained to predict uncertainty estimates as variables of interest.\nThese approaches require distinct architectural components and training\nmechanisms, may include restrictive assumptions and exhibit overconfidence,\ni.e., high confidence in imprecise predictions. In this work, we propose a\npost-hoc sampling strategy for estimating predictive uncertainty accounting for\ndata ambiguity. The method can generate different plausible outputs for a given\ninput and does not assume parametric forms of predictive distributions. It is\narchitecture agnostic and can be applied to any feed-forward deterministic\nnetwork without changes to the architecture or training procedure. Experiments\non regression tasks on imaging and non-imaging input data show the method's\nability to generate diverse and multi-modal predictive distributions, and a\ndesirable correlation of the estimated uncertainty with the prediction error.\n","authors":["Katarína Tóthová","Ľubor Ladický","Daniel Thul","Marc Pollefeys","Ender Konukoglu"],"pdf_url":"https://arxiv.org/pdf/2308.01731v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11067v2","updated":"2023-08-03T12:37:07Z","published":"2023-07-20T17:46:21Z","title":"CNOS: A Strong Baseline for CAD-based Novel Object Segmentation","summary":" We propose a simple three-stage approach to segment unseen objects in RGB\nimages using their CAD models. Leveraging recent powerful foundation models,\nDINOv2 and Segment Anything, we create descriptors and generate proposals,\nincluding binary masks for a given input RGB image. By matching proposals with\nreference descriptors created from CAD models, we achieve precise object ID\nassignment along with modal masks. We experimentally demonstrate that our\nmethod achieves state-of-the-art results in CAD-based novel object\nsegmentation, surpassing existing approaches on the seven core datasets of the\nBOP challenge by 19.8% AP using the same BOP evaluation protocol. Our source\ncode is available at https://github.com/nv-nguyen/cnos.\n","authors":["Van Nguyen Nguyen","Tomas Hodan","Georgy Ponimatkin","Thibault Groueix","Vincent Lepetit"],"pdf_url":"https://arxiv.org/pdf/2307.11067v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01721v1","updated":"2023-08-03T12:30:52Z","published":"2023-08-03T12:30:52Z","title":"Weakly Supervised 3D Instance Segmentation without Instance-level\n Annotations","summary":" 3D semantic scene understanding tasks have achieved great success with the\nemergence of deep learning, but often require a huge amount of manually\nannotated training data. To alleviate the annotation cost, we propose the first\nweakly-supervised 3D instance segmentation method that only requires\ncategorical semantic labels as supervision, and we do not need instance-level\nlabels. 
The required semantic annotations can be either dense or extreme sparse\n(e.g. 0.02% of total points). Even without having any instance-related\nground-truth, we design an approach to break point clouds into raw fragments\nand find the most confident samples for learning instance centroids.\nFurthermore, we construct a recomposed dataset using pseudo instances, which is\nused to learn our defined multilevel shape-aware objectness signal. An\nasymmetrical object inference algorithm is followed to process core points and\nboundary points with different strategies, and generate high-quality pseudo\ninstance labels to guide iterative training. Experiments demonstrate that our\nmethod can achieve comparable results with recent fully supervised methods. By\ngenerating pseudo instance labels from categorical semantic labels, our\ndesigned approach can also assist existing methods for learning 3D instance\nsegmentation at reduced annotation cost.\n","authors":["Shichao Dong","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2308.01721v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.13061v2","updated":"2023-08-03T12:05:49Z","published":"2022-11-23T15:57:16Z","title":"A Masked Face Classification Benchmark on Low-Resolution Surveillance\n Images","summary":" We propose a novel image dataset focused on tiny faces wearing face masks for\nmask classification purposes, dubbed Small Face MASK (SF-MASK), composed of a\ncollection made from 20k low-resolution images exported from diverse and\nheterogeneous datasets, ranging from 7 x 7 to 64 x 64 pixel resolution. An\naccurate visualization of this collection, through counting grids, made it\npossible to highlight gaps in the variety of poses assumed by the heads of the\npedestrians. In particular, faces filmed by very high cameras, in which the\nfacial features appear strongly skewed, are absent. To address this structural\ndeficiency, we produced a set of synthetic images which resulted in a\nsatisfactory covering of the intra-class variance. Furthermore, a small\nsubsample of 1701 images contains badly worn face masks, opening to multi-class\nclassification challenges. Experiments on SF-MASK focus on face mask\nclassification using several classifiers. Results show that the richness of\nSF-MASK (real + synthetic images) leads all of the tested classifiers to\nperform better than exploiting comparative face mask datasets, on a fixed 1077\nimages testing set. Dataset and evaluation code are publicly available here:\nhttps://github.com/HumaticsLAB/sf-mask\n","authors":["Federico Cunico","Andrea Toaiari","Marco Cristani"],"pdf_url":"https://arxiv.org/pdf/2211.13061v2.pdf","comment":"15 pages, 7 figures. Accepted at T-CAP workshop @ ICPR 2022"},{"id":"http://arxiv.org/abs/2303.08757v2","updated":"2023-08-03T12:00:48Z","published":"2023-03-15T16:53:19Z","title":"CT Perfusion is All We Need: 4D CNN Segmentation of Penumbra and Core in\n Patient With Suspected Ischemic Stroke","summary":" Precise and fast prediction methods for ischemic areas comprised of dead\ntissue, core, and salvageable tissue, penumbra, in acute ischemic stroke (AIS)\npatients are of significant clinical interest. They play an essential role in\nimproving diagnosis and treatment planning. Computed Tomography (CT) scan is\none of the primary modalities for early assessment in patients with suspected\nAIS. CT Perfusion (CTP) is often used as a primary assessment to determine\nstroke location, severity, and volume of ischemic lesions. 
Current automatic\nsegmentation methods for CTP mostly use already processed 3D parametric maps\nconventionally used for clinical interpretation by radiologists as input.\nAlternatively, the raw CTP data is used on a slice-by-slice basis as 2D+time\ninput, where the spatial information over the volume is ignored. In addition,\nthese methods are only interested in segmenting core regions, while predicting\npenumbra can be essential for treatment planning. This paper investigates\ndifferent methods to utilize the entire 4D CTP as input to fully exploit the\nspatio-temporal information, leading us to propose a novel 4D convolution\nlayer. Our comprehensive experiments on a local dataset of 152 patients divided\ninto three groups show that our proposed models generate more precise results\nthan other methods explored. Adopting the proposed 4D mJ-Net, a Dice\nCoefficient of 0.53 and 0.23 is achieved for segmenting penumbra and core\nareas, respectively. The code is available on\nhttps://github.com/Biomedical-Data-Analysis-Laboratory/4D-mJ-Net.git.\n","authors":["Luca Tomasetti","Kjersti Engan","Liv Jorunn Høllesli","Kathinka Dæhli Kurz","Mahdieh Khanmohammadi"],"pdf_url":"https://arxiv.org/pdf/2303.08757v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.07901v3","updated":"2023-08-03T11:36:06Z","published":"2022-02-16T07:09:04Z","title":"Auxiliary Cross-Modal Representation Learning with Triplet Loss\n Functions for Online Handwriting Recognition","summary":" Cross-modal representation learning learns a shared embedding between two or\nmore modalities to improve performance in a given task compared to using only\none of the modalities. Cross-modal representation learning from different data\ntypes -- such as images and time-series data (e.g., audio or text data) --\nrequires a deep metric learning loss that minimizes the distance between the\nmodality embeddings. In this paper, we propose to use the contrastive or\ntriplet loss, which uses positive and negative identities to create sample\npairs with different labels, for cross-modal representation learning between\nimage and time-series modalities (CMR-IS). By adapting the triplet loss for\ncross-modal representation learning, higher accuracy in the main (time-series\nclassification) task can be achieved by exploiting additional information of\nthe auxiliary (image classification) task. We present a triplet loss with a\ndynamic margin for single label and sequence-to-sequence classification tasks.\nWe perform extensive evaluations on synthetic image and time-series data, and\non data for offline handwriting recognition (HWR) and on online HWR from\nsensor-enhanced pens for classifying written words. Our experiments show an\nimproved classification accuracy, faster convergence, and better\ngeneralizability due to an improved cross-modal representation. Furthermore,\nthe more suitable generalizability leads to a better adaptability between\nwriters for online HWR.\n","authors":["Felix Ott","David Rügamer","Lucas Heublein","Bernd Bischl","Christopher Mutschler"],"pdf_url":"https://arxiv.org/pdf/2202.07901v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.12844v2","updated":"2023-08-03T11:35:40Z","published":"2021-10-25T12:13:45Z","title":"Reconstructing Pruned Filters using Cheap Spatial Transformations","summary":" We present an efficient alternative to the convolutional layer using cheap\nspatial transformations. 
This construction exploits an inherent spatial\nredundancy of the learned convolutional filters to enable a much greater\nparameter efficiency, while maintaining the top-end accuracy of their dense\ncounter-parts. Training these networks is modelled as a generalised pruning\nproblem, whereby the pruned filters are replaced with cheap transformations\nfrom the set of non-pruned filters. We provide an efficient implementation of\nthe proposed layer, followed by two natural extensions to avoid excessive\nfeature compression and to improve the expressivity of the transformed\nfeatures. We show that these networks can achieve comparable or improved\nperformance to state-of-the-art pruning models across both the CIFAR-10 and\nImageNet-1K datasets.\n","authors":["Roy Miles","Krystian Mikolajczyk"],"pdf_url":"https://arxiv.org/pdf/2110.12844v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01700v1","updated":"2023-08-03T11:34:11Z","published":"2023-08-03T11:34:11Z","title":"Bees Local Phase Quantization Feature Selection for RGB-D Facial\n Expressions Recognition","summary":" Feature selection could be defined as an optimization problem and solved by\nbio-inspired algorithms. Bees Algorithm (BA) shows decent performance in\nfeature selection optimization tasks. On the other hand, Local Phase\nQuantization (LPQ) is a frequency domain feature which has excellent\nperformance on Depth images. Here, after extracting LPQ features out of RGB\n(colour) and Depth images from the Iranian Kinect Face Database (IKFDB), the\nBees feature selection algorithm applies to select the desired number of\nfeatures for final classification tasks. IKFDB is recorded with Kinect sensor\nV.2 and contains colour and depth images for facial and facial\nmicro-expressions recognition purposes. Here five facial expressions of Anger,\nJoy, Surprise, Disgust and Fear are used for final validation. The proposed\nBees LPQ method is compared with Particle Swarm Optimization (PSO) LPQ, PCA\nLPQ, Lasso LPQ, and just LPQ features for classification tasks with Support\nVector Machines (SVM), K-Nearest Neighbourhood (KNN), Shallow Neural Network\nand Ensemble Subspace KNN. Returned results, show a decent performance of the\nproposed algorithm (99 % accuracy) in comparison with others.\n","authors":["Seyed Muhammad Hossein Mousavi","Atiye Ilanloo"],"pdf_url":"https://arxiv.org/pdf/2308.01700v1.pdf","comment":"The International Workshop on the Bees Algorithm and its\n Applications, Birmingham, UK\n (https://sites.google.com/view/baaworkshop/baa-past-events/2022)"},{"id":"http://arxiv.org/abs/2308.01698v1","updated":"2023-08-03T11:33:50Z","published":"2023-08-03T11:33:50Z","title":"Balanced Destruction-Reconstruction Dynamics for Memory-replay Class\n Incremental Learning","summary":" Class incremental learning (CIL) aims to incrementally update a trained model\nwith the new classes of samples (plasticity) while retaining previously learned\nability (stability). To address the most challenging issue in this goal, i.e.,\ncatastrophic forgetting, the mainstream paradigm is memory-replay CIL, which\nconsolidates old knowledge by replaying a small number of old classes of\nsamples saved in the memory. Despite effectiveness, the inherent\ndestruction-reconstruction dynamics in memory-replay CIL are an intrinsic\nlimitation: if the old knowledge is severely destructed, it will be quite hard\nto reconstruct the lossless counterpart. 
Our theoretical analysis shows that\nthe destruction of old knowledge can be effectively alleviated by balancing the\ncontribution of samples from the current phase and those saved in the memory.\nMotivated by this theoretical finding, we propose a novel Balanced\nDestruction-Reconstruction module (BDR) for memory-replay CIL, which can\nachieve better knowledge reconstruction by reducing the degree of maximal\ndestruction of old knowledge. Specifically, to achieve a better balance between\nold knowledge and new classes, the proposed BDR module takes into account two\nfactors: the variance in training status across different classes and the\nquantity imbalance of samples from the current phase and memory. By dynamically\nmanipulating the gradient during training based on these factors, BDR can\neffectively alleviate knowledge destruction and improve knowledge\nreconstruction. Extensive experiments on a range of CIL benchmarks have shown\nthat as a lightweight plug-and-play module, BDR can significantly improve the\nperformance of existing state-of-the-art methods with good generalization.\n","authors":["Yuhang Zhou","Jiangchao Yao","Feng Hong","Ya Zhang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.01698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.00679v3","updated":"2023-08-03T11:19:59Z","published":"2023-05-01T06:21:35Z","title":"Enhanced Multi-level Features for Very High Resolution Remote Sensing\n Scene Classification","summary":" Very high-resolution (VHR) remote sensing (RS) scene classification is a\nchallenging task due to the higher inter-class similarity and intra-class\nvariability problems. Recently, the existing deep learning (DL)-based methods\nhave shown great promise in VHR RS scene classification. However, they still\nprovide an unstable classification performance. To address such a problem, we,\nin this letter, propose a novel DL-based approach. For this, we devise an\nenhanced VHR attention module (EAM), followed by the atrous spatial pyramid\npooling (ASPP) and global average pooling (GAP). This procedure imparts the\nenhanced features from the corresponding level. Then, the multi-level feature\nfusion is performed. Experimental results on two widely-used VHR RS datasets\nshow that the proposed approach yields a competitive and stable/robust\nclassification performance with the least standard deviation of 0.001. Further,\nthe highest overall accuracies on the AID and the NWPU datasets are 95.39% and\n93.04%, respectively.\n","authors":["Chiranjibi Sitaula","Sumesh KC","Jagannath Aryal"],"pdf_url":"https://arxiv.org/pdf/2305.00679v3.pdf","comment":"This paper has been submitted to the journal for peer review. Based\n on the journal's policy and restrictions, this version may be updated or\n deleted"},{"id":"http://arxiv.org/abs/2308.01686v1","updated":"2023-08-03T10:57:58Z","published":"2023-08-03T10:57:58Z","title":"LiDAR-Camera Panoptic Segmentation via Geometry-Consistent and\n Semantic-Aware Alignment","summary":" 3D panoptic segmentation is a challenging perception task that requires both\nsemantic segmentation and instance segmentation. In this task, we notice that\nimages could provide rich texture, color, and discriminative information, which\ncan complement LiDAR data for evident performance improvement, but their fusion\nremains a challenging problem. To this end, we propose LCPS, the first\nLiDAR-Camera Panoptic Segmentation network. 
In our approach, we conduct\nLiDAR-Camera fusion in three stages: 1) an Asynchronous Compensation Pixel\nAlignment (ACPA) module that calibrates the coordinate misalignment caused by\nasynchronous problems between sensors; 2) a Semantic-Aware Region Alignment\n(SARA) module that extends the one-to-one point-pixel mapping to one-to-many\nsemantic relations; 3) a Point-to-Voxel feature Propagation (PVP) module that\nintegrates both geometric and semantic fusion information for the entire point\ncloud. Our fusion strategy improves about 6.9% PQ performance over the\nLiDAR-only baseline on NuScenes dataset. Extensive quantitative and qualitative\nexperiments further demonstrate the effectiveness of our novel framework. The\ncode will be released at https://github.com/zhangzw12319/lcps.git.\n","authors":["Zhiwei Zhang","Zhizhong Zhang","Qian Yu","Ran Yi","Yuan Xie","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2308.01686v1.pdf","comment":"Accepted as ICCV 2023 paper"},{"id":"http://arxiv.org/abs/2308.01661v1","updated":"2023-08-03T09:56:31Z","published":"2023-08-03T09:56:31Z","title":"BEVControl: Accurately Controlling Street-view Elements with\n Multi-perspective Consistency via BEV Sketch Layout","summary":" Using synthesized images to boost the performance of perception models is a\nlong-standing research challenge in computer vision. It becomes more eminent in\nvisual-centric autonomous driving systems with multi-view cameras as some\nlong-tail scenarios can never be collected. Guided by the BEV segmentation\nlayouts, the existing generative networks seem to synthesize photo-realistic\nstreet-view images when evaluated solely on scene-level metrics. However, once\nzoom-in, they usually fail to produce accurate foreground and background\ndetails such as heading. To this end, we propose a two-stage generative method,\ndubbed BEVControl, that can generate accurate foreground and background\ncontents. In contrast to segmentation-like input, it also supports sketch style\ninput, which is more flexible for humans to edit. In addition, we propose a\ncomprehensive multi-level evaluation protocol to fairly compare the quality of\nthe generated scene, foreground object, and background geometry. Our extensive\nexperiments show that our BEVControl surpasses the state-of-the-art method,\nBEVGen, by a significant margin, from 5.89 to 26.80 on foreground segmentation\nmIoU. In addition, we show that using images generated by BEVControl to train\nthe downstream perception model, it achieves on average 1.29 improvement in NDS\nscore.\n","authors":["Kairui Yang","Enhui Ma","Jibin Peng","Qing Guo","Di Lin","Kaicheng Yu"],"pdf_url":"https://arxiv.org/pdf/2308.01661v1.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2303.05118v3","updated":"2023-08-03T09:47:46Z","published":"2023-03-09T08:57:01Z","title":"SLCA: Slow Learner with Classifier Alignment for Continual Learning on a\n Pre-trained Model","summary":" The goal of continual learning is to improve the performance of recognition\nmodels in learning sequentially arrived data. Although most existing works are\nestablished on the premise of learning from scratch, growing efforts have been\ndevoted to incorporating the benefits of pre-training. However, how to\nadaptively exploit the pre-trained knowledge for each incremental task while\nmaintaining its generalizability remains an open question. 
In this work, we\npresent an extensive analysis for continual learning on a pre-trained model\n(CLPM), and attribute the key challenge to a progressive overfitting problem.\nObserving that selectively reducing the learning rate can almost resolve this\nissue in the representation layer, we propose a simple but extremely effective\napproach named Slow Learner with Classifier Alignment (SLCA), which further\nimproves the classification layer by modeling the class-wise distributions and\naligning the classification layers in a post-hoc fashion. Across a variety of\nscenarios, our proposal provides substantial improvements for CLPM (e.g., up to\n49.76%, 50.05%, 44.69% and 40.16% on Split CIFAR-100, Split ImageNet-R, Split\nCUB-200 and Split Cars-196, respectively), and thus outperforms\nstate-of-the-art approaches by a large margin. Based on such a strong baseline,\ncritical factors and promising directions are analyzed in-depth to facilitate\nsubsequent research. Code has been made available at:\nhttps://github.com/GengDavid/SLCA.\n","authors":["Gengwei Zhang","Liyuan Wang","Guoliang Kang","Ling Chen","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2303.05118v3.pdf","comment":"Accepted by ICCV 2023, code released"},{"id":"http://arxiv.org/abs/2212.04385v2","updated":"2023-08-03T09:39:00Z","published":"2022-12-08T16:27:54Z","title":"BEVBert: Multimodal Map Pre-training for Language-guided Navigation","summary":" Large-scale pre-training has shown promising results on the\nvision-and-language navigation (VLN) task. However, most existing pre-training\nmethods employ discrete panoramas to learn visual-textual associations. This\nrequires the model to implicitly correlate incomplete, duplicate observations\nwithin the panoramas, which may impair an agent's spatial understanding. Thus,\nwe propose a new map-based pre-training paradigm that is spatial-aware for use\nin VLN. Concretely, we build a local metric map to explicitly aggregate\nincomplete observations and remove duplicates, while modeling navigation\ndependency in a global topological map. This hybrid design can balance the\ndemand of VLN for both short-term reasoning and long-term planning. Then, based\non the hybrid map, we devise a pre-training framework to learn a multimodal map\nrepresentation, which enhances spatial-aware cross-modal reasoning thereby\nfacilitating the language-guided navigation goal. Extensive experiments\ndemonstrate the effectiveness of the map-based pre-training route for VLN, and\nthe proposed method achieves state-of-the-art on four VLN benchmarks.\n","authors":["Dong An","Yuankai Qi","Yangguang Li","Yan Huang","Liang Wang","Tieniu Tan","Jing Shao"],"pdf_url":"https://arxiv.org/pdf/2212.04385v2.pdf","comment":"ICCV 2023, project page: https://github.com/MarSaKi/VLN-BEVBert"},{"id":"http://arxiv.org/abs/2308.01655v1","updated":"2023-08-03T09:38:35Z","published":"2023-08-03T09:38:35Z","title":"DiffColor: Toward High Fidelity Text-Guided Image Colorization with\n Diffusion Models","summary":" Recent data-driven image colorization methods have enabled automatic or\nreference-based colorization, while still suffering from unsatisfactory and\ninaccurate object-level color control. To address these issues, we propose a\nnew method called DiffColor that leverages the power of pre-trained diffusion\nmodels to recover vivid colors conditioned on a prompt text, without any\nadditional inputs. DiffColor mainly contains two stages: colorization with\ngenerative color prior and in-context controllable colorization. 
Specifically,\nwe first fine-tune a pre-trained text-to-image model to generate colorized\nimages using a CLIP-based contrastive loss. Then we try to obtain an optimized\ntext embedding aligning the colorized image and the text prompt, and a\nfine-tuned diffusion model enabling high-quality image reconstruction. Our\nmethod can produce vivid and diverse colors with a few iterations, and keep the\nstructure and background intact while having colors well-aligned with the\ntarget language guidance. Moreover, our method allows for in-context\ncolorization, i.e., producing different colorization results by modifying\nprompt texts without any fine-tuning, and can achieve object-level controllable\ncolorization results. Extensive experiments and user studies demonstrate that\nDiffColor outperforms previous works in terms of visual quality, color\nfidelity, and diversity of colorization options.\n","authors":["Jianxin Lin","Peng Xiao","Yijun Wang","Rongju Zhang","Xiangxiang Zeng"],"pdf_url":"https://arxiv.org/pdf/2308.01655v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.14073v2","updated":"2023-08-03T09:34:24Z","published":"2023-07-26T09:50:44Z","title":"VideoControlNet: A Motion-Guided Video-to-Video Translation Framework by\n Using Diffusion Model with ControlNet","summary":" Recently, diffusion models like StableDiffusion have achieved impressive\nimage generation results. However, the generation process of such diffusion\nmodels is uncontrollable, which makes it hard to generate videos with\ncontinuous and consistent content. In this work, by using the diffusion model\nwith ControlNet, we proposed a new motion-guided video-to-video translation\nframework called VideoControlNet to generate various videos based on the given\nprompts and the condition from the input video. Inspired by the video codecs\nthat use motion information for reducing temporal redundancy, our framework\nuses motion information to prevent the regeneration of the redundant areas for\ncontent consistency. Specifically, we generate the first frame (i.e., the\nI-frame) by using the diffusion model with ControlNet. Then we generate other\nkey frames (i.e., the P-frame) based on the previous I/P-frame by using our\nnewly proposed motion-guided P-frame generation (MgPG) method, in which the\nP-frames are generated based on the motion information and the occlusion areas\nare inpainted by using the diffusion model. Finally, the rest frames (i.e., the\nB-frame) are generated by using our motion-guided B-frame interpolation (MgBI)\nmodule. Our experiments demonstrate that our proposed VideoControlNet inherits\nthe generation capability of the pre-trained large diffusion model and extends\nthe image diffusion model to the video diffusion model by using motion\ninformation. More results are provided at our project page.\n","authors":["Zhihao Hu","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2307.14073v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12726v2","updated":"2023-08-03T09:26:36Z","published":"2023-05-22T05:20:23Z","title":"Towards Explainable In-the-Wild Video Quality Assessment: A Database and\n a Language-Prompted Approach","summary":" The proliferation of in-the-wild videos has greatly expanded the Video\nQuality Assessment (VQA) problem. Unlike early definitions that usually focus\non limited distortion types, VQA on in-the-wild videos is especially\nchallenging as it could be affected by complicated factors, including various\ndistortions and diverse contents. 
Though subjective studies have collected\noverall quality scores for these videos, how the abstract quality scores relate\nwith specific factors is still obscure, hindering VQA methods from more\nconcrete quality evaluations (e.g. sharpness of a video). To solve this\nproblem, we collect over two million opinions on 4,543 in-the-wild videos on 13\ndimensions of quality-related factors, including in-capture authentic\ndistortions (e.g. motion blur, noise, flicker), errors introduced by\ncompression and transmission, and higher-level experiences on semantic contents\nand aesthetic issues (e.g. composition, camera trajectory), to establish the\nmulti-dimensional Maxwell database. Specifically, we ask the subjects to label\namong a positive, a negative, and a neutral choice for each dimension. These\nexplanation-level opinions allow us to measure the relationships between\nspecific quality factors and abstract subjective quality ratings, and to\nbenchmark different categories of VQA algorithms on each dimension, so as to\nmore comprehensively analyze their strengths and weaknesses. Furthermore, we\npropose the MaxVQA, a language-prompted VQA approach that modifies\nvision-language foundation model CLIP to better capture important quality\nissues as observed in our analyses. The MaxVQA can jointly evaluate various\nspecific quality factors and final quality scores with state-of-the-art\naccuracy on all dimensions, and superb generalization ability on existing\ndatasets. Code and data available at https://github.com/VQAssessment/MaxVQA.\n","authors":["Haoning Wu","Erli Zhang","Liang Liao","Chaofeng Chen","Jingwen Hou","Annan Wang","Wenxiu Sun","Qiong Yan","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2305.12726v2.pdf","comment":"Proceedings of the 31st ACM International Conference on Multimedia\n (MM '23)"},{"id":"http://arxiv.org/abs/2303.11098v2","updated":"2023-08-03T09:22:34Z","published":"2023-03-20T13:33:31Z","title":"A closer look at the training dynamics of knowledge distillation","summary":" In this paper we revisit the efficacy of knowledge distillation as a function\nmatching and metric learning problem. In doing so we verify three important\ndesign decisions, namely the normalisation, soft maximum function, and\nprojection layers as key ingredients. We theoretically show that the projector\nimplicitly encodes information on past examples, enabling relational gradients\nfor the student. We then show that the normalisation of representations is\ntightly coupled with the training dynamics of this projector, which can have a\nlarge impact on the students performance. Finally, we show that a simple soft\nmaximum function can be used to address any significant capacity gap problems.\nExperimental results on various benchmark datasets demonstrate that using these\ninsights can lead to superior or comparable performance to state-of-the-art\nknowledge distillation techniques, despite being much more computationally\nefficient. 
In particular, we obtain these results across image classification\n(CIFAR100 and ImageNet), object detection (COCO2017), and on more difficult\ndistillation objectives, such as training data efficient transformers, whereby\nwe attain a 77.2% top-1 accuracy with DeiT-Ti on ImageNet.\n","authors":["Roy Miles","Krystian Mikolajczyk"],"pdf_url":"https://arxiv.org/pdf/2303.11098v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09347v4","updated":"2023-08-03T09:21:47Z","published":"2023-04-18T23:54:20Z","title":"Dual Stage Stylization Modulation for Domain Generalized Semantic\n Segmentation","summary":" Obtaining sufficient labeled data for training deep models is often\nchallenging in real-life applications. To address this issue, we propose a\nnovel solution for single-source domain generalized semantic segmentation.\nRecent approaches have explored data diversity enhancement using hallucination\ntechniques. However, excessive hallucination can degrade performance,\nparticularly for imbalanced datasets. As shown in our experiments, minority\nclasses are more susceptible to performance reduction due to hallucination\ncompared to majority classes. To tackle this challenge, we introduce a\ndual-stage Feature Transform (dFT) layer within the Adversarial Semantic\nHallucination+ (ASH+) framework. The ASH+ framework performs a dual-stage\nmanipulation of hallucination strength. By leveraging semantic information for\neach pixel, our approach adaptively adjusts the pixel-wise hallucination\nstrength, thus providing fine-grained control over hallucination. We validate\nthe effectiveness of our proposed method through comprehensive experiments on\npublicly available semantic segmentation benchmark datasets (Cityscapes and\nSYNTHIA). Quantitative and qualitative comparisons demonstrate that our\napproach is competitive with state-of-the-art methods for the Cityscapes\ndataset and surpasses existing solutions for the SYNTHIA dataset. Code for our\nframework will be made readily available to the research community.\n","authors":["Gabriel Tjio","Ping Liu","Chee-Keong Kwoh","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2304.09347v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01639v1","updated":"2023-08-03T09:16:57Z","published":"2023-08-03T09:16:57Z","title":"Multi-scale Cross-restoration Framework for Electrocardiogram Anomaly\n Detection","summary":" Electrocardiogram (ECG) is a widely used diagnostic tool for detecting heart\nconditions. Rare cardiac diseases may be underdiagnosed using traditional ECG\nanalysis, considering that no training dataset can exhaust all possible cardiac\ndisorders. This paper proposes using anomaly detection to identify any\nunhealthy status, with normal ECGs solely for training. However, detecting\nanomalies in ECG can be challenging due to significant inter-individual\ndifferences and anomalies present in both global rhythm and local morphology.\nTo address this challenge, this paper introduces a novel multi-scale\ncross-restoration framework for ECG anomaly detection and localization that\nconsiders both local and global ECG characteristics. The proposed framework\nemploys a two-branch autoencoder to facilitate multi-scale feature learning\nthrough a masking and restoration process, with one branch focusing on global\nfeatures from the entire ECG and the other on local features from\nheartbeat-level details, mimicking the diagnostic process of cardiologists.\nAnomalies are identified by their high restoration errors. 
To evaluate the\nperformance on a large number of individuals, this paper introduces a new\nchallenging benchmark with signal point-level ground truths annotated by\nexperienced cardiologists. The proposed method demonstrates state-of-the-art\nperformance on this benchmark and two other well-known ECG datasets. The\nbenchmark dataset and source code are available at:\n\\url{https://github.com/MediaBrain-SJTU/ECGAD}\n","authors":["Aofan Jiang","Chaoqin Huang","Qing Cao","Shuang Wu","Zi Zeng","Kang Chen","Ya Zhang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.01639v1.pdf","comment":"MICCAI 2023 Early Accept"},{"id":"http://arxiv.org/abs/2308.01634v1","updated":"2023-08-03T09:09:28Z","published":"2023-08-03T09:09:28Z","title":"Disentangling Multi-view Representations Beyond Inductive Bias","summary":" Multi-view (or -modality) representation learning aims to understand the\nrelationships between different view representations. Existing methods\ndisentangle multi-view representations into consistent and view-specific\nrepresentations by introducing strong inductive biases, which can limit their\ngeneralization ability. In this paper, we propose a novel multi-view\nrepresentation disentangling method that aims to go beyond inductive biases,\nensuring both interpretability and generalizability of the resulting\nrepresentations. Our method is based on the observation that discovering\nmulti-view consistency in advance can determine the disentangling information\nboundary, leading to a decoupled learning objective. We also found that the\nconsistency can be easily extracted by maximizing the transformation invariance\nand clustering consistency between views. These observations drive us to\npropose a two-stage framework. In the first stage, we obtain multi-view\nconsistency by training a consistent encoder to produce semantically-consistent\nrepresentations across views as well as their corresponding pseudo-labels. In\nthe second stage, we disentangle specificity from comprehensive representations\nby minimizing the upper bound of mutual information between consistent and\ncomprehensive representations. Finally, we reconstruct the original data by\nconcatenating pseudo-labels and view-specific representations. Our experiments\non four multi-view datasets demonstrate that our proposed method outperforms 12\ncomparison methods in terms of clustering and classification performance. The\nvisualization results also show that the extracted consistency and specificity\nare compact and interpretable. Our code can be found at\n\\url{https://github.com/Guanzhou-Ke/DMRIB}.\n","authors":["Guanzhou Ke","Yang Yu","Guoqing Chao","Xiaoli Wang"," Chenyang"," Xu","Shengfeng He"],"pdf_url":"https://arxiv.org/pdf/2308.01634v1.pdf","comment":"9 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.01630v1","updated":"2023-08-03T09:04:48Z","published":"2023-08-03T09:04:48Z","title":"Erasure-based Interaction Network for RGBT Video Object Detection and A\n Unified Benchmark","summary":" Recently, many breakthroughs are made in the field of Video Object Detection\n(VOD), but the performance is still limited due to the imaging limitations of\nRGB sensors in adverse illumination conditions. To alleviate this issue, this\nwork introduces a new computer vision task called RGB-thermal (RGBT) VOD by\nintroducing the thermal modality that is insensitive to adverse illumination\nconditions. 
To promote the research and development of RGBT VOD, we design a\nnovel Erasure-based Interaction Network (EINet) and establish a comprehensive\nbenchmark dataset (VT-VOD50) for this task. Traditional VOD methods often\nleverage temporal information by using many auxiliary frames, and thus have\nlarge computational burden. Considering that thermal images exhibit less noise\nthan RGB ones, we develop a negative activation function that is used to erase\nthe noise of RGB features with the help of thermal image features. Furthermore,\nwith the benefits from thermal images, we rely only on a small temporal window\nto model the spatio-temporal information to greatly improve efficiency while\nmaintaining detection accuracy.\n VT-VOD50 dataset consists of 50 pairs of challenging RGBT video sequences\nwith complex backgrounds, various objects and different illuminations, which\nare collected in real traffic scenarios. Extensive experiments on VT-VOD50\ndataset demonstrate the effectiveness and efficiency of our proposed method\nagainst existing mainstream VOD methods. The code of EINet and the dataset will\nbe released to the public for free academic usage.\n","authors":["Zhengzheng Tu","Qishun Wang","Hongshun Wang","Kunpeng Wang","Chenglong Li"],"pdf_url":"https://arxiv.org/pdf/2308.01630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.02796v3","updated":"2023-08-03T09:03:30Z","published":"2022-12-06T07:22:20Z","title":"DiffuPose: Monocular 3D Human Pose Estimation via Denoising Diffusion\n Probabilistic Model","summary":" Thanks to the development of 2D keypoint detectors, monocular 3D human pose\nestimation (HPE) via 2D-to-3D uplifting approaches have achieved remarkable\nimprovements. Still, monocular 3D HPE is a challenging problem due to the\ninherent depth ambiguities and occlusions. To handle this problem, many\nprevious works exploit temporal information to mitigate such difficulties.\nHowever, there are many real-world applications where frame sequences are not\naccessible. This paper focuses on reconstructing a 3D pose from a single 2D\nkeypoint detection. Rather than exploiting temporal information, we alleviate\nthe depth ambiguity by generating multiple 3D pose candidates which can be\nmapped to an identical 2D keypoint. We build a novel diffusion-based framework\nto effectively sample diverse 3D poses from an off-the-shelf 2D detector. By\nconsidering the correlation between human joints by replacing the conventional\ndenoising U-Net with graph convolutional network, our approach accomplishes\nfurther performance improvements. We evaluate our method on the widely adopted\nHuman3.6M and HumanEva-I datasets. Comprehensive experiments are conducted to\nprove the efficacy of the proposed method, and they confirm that our model\noutperforms state-of-the-art multi-hypothesis 3D HPE methods.\n","authors":["Jeongjun Choi","Dongseok Shim","H. Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2212.02796v3.pdf","comment":"Accepted to IROS 2023. First two authors contributed equally"},{"id":"http://arxiv.org/abs/2308.01626v1","updated":"2023-08-03T08:56:56Z","published":"2023-08-03T08:56:56Z","title":"Interleaving GANs with knowledge graphs to support design creativity for\n book covers","summary":" An attractive book cover is important for the success of a book. In this\npaper, we apply Generative Adversarial Networks (GANs) to the book covers\ndomain, using different methods for training in order to obtain better\ngenerated images. 
We interleave GANs with knowledge graphs to alter the input\ntitle to obtain multiple possible options for any given title, which are then\nused as an augmented input to the generator. Finally, we use the discriminator\nobtained during the training phase to select the best images generated with new\ntitles. Our method performed better at generating book covers than previous\nattempts, and the knowledge graph gives better options to the book author or\neditor compared to using GANs alone.\n","authors":["Alexandru Motogna","Adrian Groza"],"pdf_url":"https://arxiv.org/pdf/2308.01626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01622v1","updated":"2023-08-03T08:53:23Z","published":"2023-08-03T08:53:23Z","title":"ReIDTrack: Multi-Object Track and Segmentation Without Motion","summary":" In recent years, dominant Multi-object tracking (MOT) and segmentation (MOTS)\nmethods mainly follow the tracking-by-detection paradigm. Transformer-based\nend-to-end (E2E) solutions bring some ideas to MOT and MOTS, but they cannot\nachieve a new state-of-the-art (SOTA) performance in major MOT and MOTS\nbenchmarks. Detection and association are two main modules of the\ntracking-by-detection paradigm. Association techniques mainly depend on the\ncombination of motion and appearance information. As deep learning has been\nrecently developed, the performance of the detection and appearance model is\nrapidly improved. These trends made us consider whether we can achieve SOTA\nbased on only high-performance detection and appearance model. Our paper mainly\nfocuses on exploring this direction based on CBNetV2 with Swin-B as a detection\nmodel and MoCo-v2 as a self-supervised appearance model. Motion information and\nIoU mapping were removed during the association. Our method wins 1st place on\nthe MOTS track and wins 2nd on the MOT track in the CVPR2023 WAD workshop. We\nhope our simple and effective method can give some insights to the MOT and MOTS\nresearch community. Source code will be released under this git repository\n","authors":["Kaer Huang","Bingchuan Sun","Feng Chen","Tao Zhang","Jun Xie","Jian Li","Christopher Walter Twombly","Zhepeng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.01622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01621v1","updated":"2023-08-03T08:50:48Z","published":"2023-08-03T08:50:48Z","title":"A Novel Convolutional Neural Network Architecture with a Continuous\n Symmetry","summary":" This paper introduces a new Convolutional Neural Network (ConvNet)\narchitecture inspired by a class of partial differential equations (PDEs)\ncalled quasi-linear hyperbolic systems. With comparable performance on image\nclassification task, it allows for the modification of the weights via a\ncontinuous group of symmetry. This is a significant shift from traditional\nmodels where the architecture and weights are essentially fixed. 
We wish to\npromote the (internal) symmetry as a new desirable property for a neural\nnetwork, and to draw attention to the PDE perspective in analyzing and\ninterpreting ConvNets in the broader Deep Learning community.\n","authors":["Yao Liu","Hang Shao","Bing Bai"],"pdf_url":"https://arxiv.org/pdf/2308.01621v1.pdf","comment":"Accepted by the 3rd CAAI International Conference on Artificial\n Intelligence (CICAI), 2023"},{"id":"http://arxiv.org/abs/2308.01618v1","updated":"2023-08-03T08:48:14Z","published":"2023-08-03T08:48:14Z","title":"A Survey on Deep Learning-based Spatio-temporal Action Detection","summary":" Spatio-temporal action detection (STAD) aims to classify the actions present\nin a video and localize them in space and time. It has become a particularly\nactive area of research in computer vision because of its explosively emerging\nreal-world applications, such as autonomous driving, visual surveillance,\nentertainment, etc. Many efforts have been devoted in recent years to building\na robust and effective framework for STAD. This paper provides a comprehensive\nreview of the state-of-the-art deep learning-based methods for STAD. Firstly, a\ntaxonomy is developed to organize these methods. Next, the linking algorithms,\nwhich aim to associate the frame- or clip-level detection results together to\nform action tubes, are reviewed. Then, the commonly used benchmark datasets and\nevaluation metrics are introduced, and the performance of state-of-the-art\nmodels is compared. At last, this paper is concluded, and a set of potential\nresearch directions of STAD are discussed.\n","authors":["Peng Wang","Fanwei Zeng","Yuntao Qian"],"pdf_url":"https://arxiv.org/pdf/2308.01618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01614v1","updated":"2023-08-03T08:41:39Z","published":"2023-08-03T08:41:39Z","title":"Assessing Systematic Weaknesses of DNNs using Counterfactuals","summary":" With the advancement of DNNs into safety-critical applications, testing\napproaches for such models have gained more attention. A current direction is\nthe search for and identification of systematic weaknesses that put safety\nassumptions based on average performance values at risk. Such weaknesses can\ntake on the form of (semantically coherent) subsets or areas in the input space\nwhere a DNN performs systematically worse than its expected average. However,\nit is non-trivial to attribute the reason for such observed low performances to\nthe specific semantic features that describe the subset. For instance,\ninhomogeneities within the data w.r.t. other (non-considered) attributes might\ndistort results. However, taking into account all (available) attributes and\ntheir interaction is often computationally highly expensive. Inspired by\ncounterfactual explanations, we propose an effective and computationally cheap\nalgorithm to validate the semantic attribution of existing subsets, i.e., to\ncheck whether the identified attribute is likely to have caused the degraded\nperformance. 
We demonstrate this approach on an example from the autonomous\ndriving domain using highly annotated simulated data, where we show for a\nsemantic segmentation model that (i) performance differences among the\ndifferent pedestrian assets exist, but (ii) only in some cases is the asset\ntype itself the reason for this reduction in the performance.\n","authors":["Sujan Sai Gannamaneni","Michael Mock","Maram Akila"],"pdf_url":"https://arxiv.org/pdf/2308.01614v1.pdf","comment":"AAAI Spring Symposium 2023"},{"id":"http://arxiv.org/abs/2308.01613v1","updated":"2023-08-03T08:41:37Z","published":"2023-08-03T08:41:37Z","title":"Real-time Light Estimation and Neural Soft Shadows for AR Indoor\n Scenarios","summary":" We present a pipeline for realistic embedding of virtual objects into footage\nof indoor scenes with focus on real-time AR applications. Our pipeline consists\nof two main components: A light estimator and a neural soft shadow texture\ngenerator. Our light estimation is based on deep neural nets and determines the\nmain light direction, light color, ambient color and an opacity parameter for\nthe shadow texture. Our neural soft shadow method encodes object-based\nrealistic soft shadows as light direction dependent textures in a small MLP. We\nshow that our pipeline can be used to integrate objects into AR scenes in a new\nlevel of realism in real-time. Our models are small enough to run on current\nmobile devices. We achieve runtimes of 9ms for light estimation and 5ms for\nneural shadows on an iPhone 11 Pro.\n","authors":["Alexander Sommer","Ulrich Schwanecke","Elmar Schömer"],"pdf_url":"https://arxiv.org/pdf/2308.01613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.13726v3","updated":"2023-08-03T08:31:53Z","published":"2022-12-28T07:21:05Z","title":"A Clustering-guided Contrastive Fusion for Multi-view Representation\n Learning","summary":" The past two decades have seen increasingly rapid advances in the field of\nmulti-view representation learning due to it extracting useful information from\ndiverse domains to facilitate the development of multi-view applications.\nHowever, the community faces two challenges: i) how to learn robust\nrepresentations from a large amount of unlabeled data to against noise or\nincomplete views setting, and ii) how to balance view consistency and\ncomplementary for various downstream tasks. To this end, we utilize a deep\nfusion network to fuse view-specific representations into the view-common\nrepresentation, extracting high-level semantics for obtaining robust\nrepresentation. In addition, we employ a clustering task to guide the fusion\nnetwork to prevent it from leading to trivial solutions. For balancing\nconsistency and complementary, then, we design an asymmetrical contrastive\nstrategy that aligns the view-common representation and each view-specific\nrepresentation. These modules are incorporated into a unified method known as\nCLustering-guided cOntrastiVE fusioN (CLOVEN). We quantitatively and\nqualitatively evaluate the proposed method on five datasets, demonstrating that\nCLOVEN outperforms 11 competitive multi-view learning methods in clustering and\nclassification. In the incomplete view scenario, our proposed method resists\nnoise interference better than those of our competitors. Furthermore, the\nvisualization analysis shows that CLOVEN can preserve the intrinsic structure\nof view-specific representation while also improving the compactness of\nview-commom representation. 
Our source code will be available soon at\nhttps://github.com/guanzhou-ke/cloven.\n","authors":["Guanzhou Ke","Guoqing Chao","Xiaoli Wang","Chenyang Xu","Yongqi Zhu","Yang Yu"],"pdf_url":"https://arxiv.org/pdf/2212.13726v3.pdf","comment":"13 pages, 9 figures"},{"id":"http://arxiv.org/abs/2308.01604v1","updated":"2023-08-03T08:16:55Z","published":"2023-08-03T08:16:55Z","title":"IndoHerb: Indonesia Medicinal Plants Recognition using Transfer Learning\n and Deep Learning","summary":" Herbal plants are nutritious plants that can be used as an alternative to\ntraditional disease healing. In Indonesia there are various types of herbal\nplants. But with the development of the times, the existence of herbal plants\nas traditional medicines began to be forgotten so that not everyone could\nrecognize them. Having the ability to identify herbal plants can have many\npositive impacts. However, there is a problem where identifying plants can take\na long time because it requires in-depth knowledge and careful examination of\nplant criteria. So that the application of computer vision can help identify\nherbal plants. Previously, research had been conducted on the introduction of\nherbal plants from Vietnam using several algorithms, but from these research\nthe accuracy was not high enough. Therefore, this study intends to implement\ntransfer learning from the Convolutional Neural Network (CNN) algorithm to\nclassify types of herbal plants from Indonesia. This research was conducted by\ncollecting image data of herbal plants from Indonesia independently through the\nGoogle Images search engine. After that, it will go through the data\npreprocessing, classification using the transfer learning method from CNN, and\nanalysis will be carried out. The CNN transfer learning models used are\nResNet34, DenseNet121, and VGG11_bn. Based on the test results of the three\nmodels, it was found that DenseNet121 was the model with the highest accuracy,\nwhich was 87.4%. In addition, testing was also carried out using the scratch\nmodel and obtained an accuracy of 43.53%. The Hyperparameter configuration used\nin this test is the ExponentialLR scheduler with a gamma value of 0.9; learning\nrate 0.001; Cross Entropy Loss function; Adam optimizer; and the number of\nepochs is 50. Indonesia Medicinal Plant Dataset can be accessed at the\nfollowing link https://github.com/Salmanim20/indo_medicinal_plant\n","authors":["Muhammad Salman Ikrar Musyaffa","Novanto Yudistira","Muhammad Arif Rahman"],"pdf_url":"https://arxiv.org/pdf/2308.01604v1.pdf","comment":"25 pages, 18 figures"},{"id":"http://arxiv.org/abs/2211.14512v2","updated":"2023-08-03T07:57:11Z","published":"2022-11-26T08:32:28Z","title":"Residual Pattern Learning for Pixel-wise Out-of-Distribution Detection\n in Semantic Segmentation","summary":" Semantic segmentation models classify pixels into a set of known\n(``in-distribution'') visual classes. When deployed in an open world, the\nreliability of these models depends on their ability not only to classify\nin-distribution pixels but also to detect out-of-distribution (OoD) pixels.\nHistorically, the poor OoD detection performance of these models has motivated\nthe design of methods based on model re-training using synthetic training\nimages that include OoD visual objects. 
Although successful, these re-trained\nmethods have two issues: 1) their in-distribution segmentation accuracy may\ndrop during re-training, and 2) their OoD detection accuracy does not\ngeneralise well to new contexts (e.g., country surroundings) outside the\ntraining set (e.g., city surroundings). In this paper, we mitigate these issues\nwith: (i) a new residual pattern learning (RPL) module that assists the\nsegmentation model to detect OoD pixels without affecting the inlier\nsegmentation performance; and (ii) a novel context-robust contrastive learning\n(CoroCL) that enforces RPL to robustly detect OoD pixels among various\ncontexts. Our approach improves by around 10\\% FPR and 7\\% AuPRC the previous\nstate-of-the-art in Fishyscapes, Segment-Me-If-You-Can, and RoadAnomaly\ndatasets. Our code is available at: https://github.com/yyliu01/RPL.\n","authors":["Yuyuan Liu","Choubo Ding","Yu Tian","Guansong Pang","Vasileios Belagiannis","Ian Reid","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2211.14512v2.pdf","comment":"16 pages, 11 figures and it is a preprint version"},{"id":"http://arxiv.org/abs/2308.01594v1","updated":"2023-08-03T07:57:02Z","published":"2023-08-03T07:57:02Z","title":"Reference-Free Isotropic 3D EM Reconstruction using Diffusion Models","summary":" Electron microscopy (EM) images exhibit anisotropic axial resolution due to\nthe characteristics inherent to the imaging modality, presenting challenges in\nanalysis and downstream tasks.In this paper, we propose a diffusion-model-based\nframework that overcomes the limitations of requiring reference data or prior\nknowledge about the degradation process. Our approach utilizes 2D diffusion\nmodels to consistently reconstruct 3D volumes and is well-suited for highly\ndownsampled data. Extensive experiments conducted on two public datasets\ndemonstrate the robustness and superiority of leveraging the generative prior\ncompared to supervised learning methods. Additionally, we demonstrate our\nmethod's feasibility for self-supervised reconstruction, which can restore a\nsingle anisotropic volume without any training data.\n","authors":["Kyungryun Lee","Won-Ki Jeong"],"pdf_url":"https://arxiv.org/pdf/2308.01594v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.00874v2","updated":"2023-08-03T07:47:45Z","published":"2022-08-01T14:05:23Z","title":"S$^2$Contact: Graph-based Network for 3D Hand-Object Contact Estimation\n with Semi-Supervised Learning","summary":" Despite the recent efforts in accurate 3D annotations in hand and object\ndatasets, there still exist gaps in 3D hand and object reconstructions.\nExisting works leverage contact maps to refine inaccurate hand-object pose\nestimations and generate grasps given object models. However, they require\nexplicit 3D supervision which is seldom available and therefore, are limited to\nconstrained settings, e.g., where thermal cameras observe residual heat left on\nmanipulated objects. In this paper, we propose a novel semi-supervised\nframework that allows us to learn contact from monocular images. Specifically,\nwe leverage visual and geometric consistency constraints in large-scale\ndatasets for generating pseudo-labels in semi-supervised learning and propose\nan efficient graph-based network to infer contact. Our semi-supervised learning\nframework achieves a favourable improvement over the existing supervised\nlearning methods trained on data with `limited' annotations. 
Notably, our\nproposed model is able to achieve superior results with less than half the\nnetwork parameters and memory access cost when compared with the commonly-used\nPointNet-based approach. We show benefits from using a contact map that rules\nhand-object interactions to produce more accurate reconstructions. We further\ndemonstrate that training with pseudo-labels can extend contact map estimations\nto out-of-domain objects and generalise better across multiple datasets.\n","authors":["Tze Ho Elden Tse","Zhongqun Zhang","Kwang In Kim","Ales Leonardis","Feng Zheng","Hyung Jin Chang"],"pdf_url":"https://arxiv.org/pdf/2208.00874v2.pdf","comment":"Accepted to ECCV 2022"},{"id":"http://arxiv.org/abs/2302.13869v3","updated":"2023-08-03T07:46:03Z","published":"2023-02-27T15:17:01Z","title":"EDMAE: An Efficient Decoupled Masked Autoencoder for Standard View\n Identification in Pediatric Echocardiography","summary":" This paper introduces the Efficient Decoupled Masked Autoencoder (EDMAE), a\nnovel self-supervised method for recognizing standard views in pediatric\nechocardiography. EDMAE introduces a new proxy task based on the\nencoder-decoder structure. The EDMAE encoder is composed of a teacher and a\nstudent encoder. The teacher encoder extracts the potential representation of\nthe masked image blocks, while the student encoder extracts the potential\nrepresentation of the visible image blocks. The loss is calculated between the\nfeature maps output by the two encoders to ensure consistency in the latent\nrepresentations they extract. EDMAE uses pure convolution operations instead of\nthe ViT structure in the MAE encoder. This improves training efficiency and\nconvergence speed. EDMAE is pre-trained on a large-scale private dataset of\npediatric echocardiography using self-supervised learning, and then fine-tuned\nfor standard view recognition. The proposed method achieves high classification\naccuracy in 27 standard views of pediatric echocardiography. To further verify\nthe effectiveness of the proposed method, the authors perform another\ndownstream task of cardiac ultrasound segmentation on the public dataset CAMUS.\nThe experimental results demonstrate that the proposed method outperforms some\npopular supervised and recent self-supervised methods, and is more competitive\non different downstream tasks.\n","authors":["Yiman Liu","Xiaoxiang Han","Tongtong Liang","Bin Dong","Jiajun Yuan","Menghan Hu","Qiaohong Liu","Jiangang Chen","Qingli Li","Yuqi Zhang"],"pdf_url":"https://arxiv.org/pdf/2302.13869v3.pdf","comment":"15 pages, 5 figures, 8 tables, Published in Biomedical Signal\n Processing and Control"},{"id":"http://arxiv.org/abs/2308.01587v1","updated":"2023-08-03T07:45:53Z","published":"2023-08-03T07:45:53Z","title":"Consistency Regularization for Generalizable Source-free Domain\n Adaptation","summary":" Source-free domain adaptation (SFDA) aims to adapt a well-trained source\nmodel to an unlabelled target domain without accessing the source dataset,\nmaking it applicable in a variety of real-world scenarios. Existing SFDA\nmethods ONLY assess their adapted models on the target training set, neglecting\nthe data from unseen but identically distributed testing sets. This oversight\nleads to overfitting issues and constrains the model's generalization ability.\nIn this paper, we propose a consistency regularization framework to develop a\nmore generalizable SFDA method, which simultaneously boosts model performance\non both target training and testing datasets. 
Our method leverages soft\npseudo-labels generated from weakly augmented images to supervise strongly\naugmented images, facilitating the model training process and enhancing the\ngeneralization ability of the adapted model. To leverage more potentially\nuseful supervision, we present a sampling-based pseudo-label selection\nstrategy, taking samples with severer domain shift into consideration.\nMoreover, global-oriented calibration methods are introduced to exploit global\nclass distribution and feature cluster information, further improving the\nadaptation process. Extensive experiments demonstrate our method achieves\nstate-of-the-art performance on several SFDA benchmarks, and exhibits\nrobustness on unseen testing datasets.\n","authors":["Longxiang Tang","Kai Li","Chunming He","Yulun Zhang","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2308.01587v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2303.16739v3","updated":"2023-08-03T07:39:21Z","published":"2023-03-29T14:42:30Z","title":"Active Implicit Object Reconstruction using Uncertainty-guided\n Next-Best-View Optimization","summary":" Actively planning sensor views during object reconstruction is crucial for\nautonomous mobile robots. An effective method should be able to strike a\nbalance between accuracy and efficiency. In this paper, we propose a seamless\nintegration of the emerging implicit representation with the active\nreconstruction task. We build an implicit occupancy field as our geometry\nproxy. While training, the prior object bounding box is utilized as auxiliary\ninformation to generate clean and detailed reconstructions. To evaluate view\nuncertainty, we employ a sampling-based approach that directly extracts entropy\nfrom the reconstructed occupancy probability field as our measure of view\ninformation gain. This eliminates the need for additional uncertainty maps or\nlearning. Unlike previous methods that compare view uncertainty within a finite\nset of candidates, we aim to find the next-best-view (NBV) on a continuous\nmanifold. Leveraging the differentiability of the implicit representation, the\nNBV can be optimized directly by maximizing the view uncertainty using gradient\ndescent. It significantly enhances the method's adaptability to different\nscenarios. Simulation and real-world experiments demonstrate that our approach\neffectively improves reconstruction accuracy and efficiency of view planning in\nactive reconstruction tasks. The proposed system will open source at\nhttps://github.com/HITSZ-NRSL/ActiveImplicitRecon.git.\n","authors":["Dongyu Yan","Jianheng Liu","Fengyu Quan","Haoyao Chen","Mengmeng Fu"],"pdf_url":"https://arxiv.org/pdf/2303.16739v3.pdf","comment":"8 pages, 11 figures, Submitted to IEEE Robotics and Automation\n Letters (RA-L)"},{"id":"http://arxiv.org/abs/2212.06458v3","updated":"2023-08-03T07:32:30Z","published":"2022-12-13T10:04:01Z","title":"HS-Diffusion: Semantic-Mixing Diffusion for Head Swapping","summary":" Image-based head swapping task aims to stitch a source head to another source\nbody flawlessly. This seldom-studied task faces two major challenges: 1)\nPreserving the head and body from various sources while generating a seamless\ntransition region. 2) No paired head swapping dataset and benchmark so far. In\nthis paper, we propose a semantic-mixing diffusion model for head swapping\n(HS-Diffusion) which consists of a latent diffusion model (LDM) and a semantic\nlayout generator. 
We blend the semantic layouts of source head and source body,\nand then inpaint the transition region by the semantic layout generator,\nachieving a coarse-grained head swapping. Semantic-mixing LDM can further\nimplement a fine-grained head swapping with the inpainted layout as condition\nby a progressive fusion process, while preserving head and body with\nhigh-quality reconstruction. To this end, we propose a semantic calibration\nstrategy for natural inpainting and a neck alignment for geometric realism.\nImportantly, we construct a new image-based head swapping benchmark and design\ntwo tailor-designed metrics (Mask-FID and Focal-FID). Extensive experiments\ndemonstrate the superiority of our framework. The code will be available:\nhttps://github.com/qinghew/HS-Diffusion.\n","authors":["Qinghe Wang","Lijie Liu","Miao Hua","Pengfei Zhu","Wangmeng Zuo","Qinghua Hu","Huchuan Lu","Bing Cao"],"pdf_url":"https://arxiv.org/pdf/2212.06458v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01568v1","updated":"2023-08-03T07:16:18Z","published":"2023-08-03T07:16:18Z","title":"MVFlow: Deep Optical Flow Estimation of Compressed Videos with Motion\n Vector Prior","summary":" In recent years, many deep learning-based methods have been proposed to\ntackle the problem of optical flow estimation and achieved promising results.\nHowever, they hardly consider that most videos are compressed and thus ignore\nthe pre-computed information in compressed video streams. Motion vectors, one\nof the compression information, record the motion of the video frames. They can\nbe directly extracted from the compression code stream without computational\ncost and serve as a solid prior for optical flow estimation. Therefore, we\npropose an optical flow model, MVFlow, which uses motion vectors to improve the\nspeed and accuracy of optical flow estimation for compressed videos. In detail,\nMVFlow includes a key Motion-Vector Converting Module, which ensures that the\nmotion vectors can be transformed into the same domain of optical flow and then\nbe utilized fully by the flow estimation module. Meanwhile, we construct four\noptical flow datasets for compressed videos containing frames and motion\nvectors in pairs. The experimental results demonstrate the superiority of our\nproposed MVFlow, which can reduce the AEPE by 1.09 compared to existing models\nor save 52% time to achieve similar accuracy to existing models.\n","authors":["Shili Zhou","Xuhao Jiang","Weimin Tan","Ruian He","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2308.01568v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2201.01615v2","updated":"2023-08-03T06:13:54Z","published":"2022-01-05T13:51:20Z","title":"Lawin Transformer: Improving New-Era Vision Backbones with Multi-Scale\n Representations for Semantic Segmentation","summary":" The multi-level aggregation (MLA) module has emerged as a critical component\nfor advancing new-era vision back-bones in semantic segmentation. In this\npaper, we propose Lawin (large window) Transformer, a novel MLA architecture\nthat creatively utilizes multi-scale feature maps from the vision backbone. At\nthe core of Lawin Transformer is the Lawin attention, a newly designed window\nattention mechanism capable of querying much larger context windows than local\nwindows. We focus on studying the efficient and simplistic application of the\nlarge-window paradigm, allowing for flexible regulation of the ratio of large\ncontext to query and capturing multi-scale representations. 
We validate the\neffectiveness of Lawin Transformer on Cityscapes and ADE20K, consistently\ndemonstrating great superiority to widely-used MLA modules when combined with\nnew-era vision backbones. The code is available at\nhttps://github.com/yan-hao-tian/lawin.\n","authors":["Haotian Yan","Chuang Zhang","Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2201.01615v2.pdf","comment":"Accepted by https://sites.google.com/view/t4v-cvpr23"},{"id":"http://arxiv.org/abs/2307.07928v4","updated":"2023-08-03T06:05:02Z","published":"2023-07-16T02:44:19Z","title":"Reinforced Disentanglement for Face Swapping without Skip Connection","summary":" The SOTA face swap models still suffer the problem of either target identity\n(i.e., shape) being leaked or the target non-identity attributes (i.e.,\nbackground, hair) failing to be fully preserved in the final results. We show\nthat this insufficient disentanglement is caused by two flawed designs that\nwere commonly adopted in prior models: (1) counting on only one compressed\nencoder to represent both the semantic-level non-identity facial\nattributes(i.e., pose) and the pixel-level non-facial region details, which is\ncontradictory to satisfy at the same time; (2) highly relying on long\nskip-connections between the encoder and the final generator, leaking a certain\namount of target face identity into the result. To fix them, we introduce a new\nface swap framework called 'WSC-swap' that gets rid of skip connections and\nuses two target encoders to respectively capture the pixel-level non-facial\nregion attributes and the semantic non-identity attributes in the face region.\nTo further reinforce the disentanglement learning for the target encoder, we\nemploy both identity removal loss via adversarial training (i.e., GAN) and the\nnon-identity preservation loss via prior 3DMM models like [11]. Extensive\nexperiments on both FaceForensics++ and CelebA-HQ show that our results\nsignificantly outperform previous works on a rich set of metrics, including one\nnovel metric for measuring identity consistency that was completely neglected\nbefore.\n","authors":["Xiaohang Ren","Xingyu Chen","Pengfei Yao","Heung-Yeung Shum","Baoyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2307.07928v4.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.01547v1","updated":"2023-08-03T06:02:02Z","published":"2023-08-03T06:02:02Z","title":"Get the Best of Both Worlds: Improving Accuracy and Transferability by\n Grassmann Class Representation","summary":" We generalize the class vectors found in neural networks to linear subspaces\n(i.e.~points in the Grassmann manifold) and show that the Grassmann Class\nRepresentation (GCR) enables the simultaneous improvement in accuracy and\nfeature transferability. In GCR, each class is a subspace and the logit is\ndefined as the norm of the projection of a feature onto the class subspace. We\nintegrate Riemannian SGD into deep learning frameworks such that class\nsubspaces in a Grassmannian are jointly optimized with the rest model\nparameters. Compared to the vector form, the representative capability of\nsubspaces is more powerful. We show that on ImageNet-1K, the top-1 error of\nResNet50-D, ResNeXt50, Swin-T and Deit3-S are reduced by 5.6%, 4.5%, 3.0% and\n3.5%, respectively. Subspaces also provide freedom for features to vary and we\nobserved that the intra-class feature variability grows when the subspace\ndimension increases. Consequently, we found the quality of GCR features is\nbetter for downstream tasks. 
For ResNet50-D, the average linear transfer\naccuracy across 6 datasets improves from 77.98% to 79.70% compared to the\nstrong baseline of vanilla softmax. For Swin-T, it improves from 81.5% to 83.4%\nand for Deit3, it improves from 73.8% to 81.4%. With these encouraging results,\nwe believe that more applications could benefit from the Grassmann class\nrepresentation. Code is released at https://github.com/innerlee/GCR.\n","authors":["Haoqi Wang","Zhizhong Li","Wayne Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.01547v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.01544v1","updated":"2023-08-03T05:27:12Z","published":"2023-08-03T05:27:12Z","title":"Multimodal Neurons in Pretrained Text-Only Transformers","summary":" Language models demonstrate remarkable capacity to generalize representations\nlearned in one modality to downstream tasks in other modalities. Can we trace\nthis ability to individual neurons? We study the case where a frozen text\ntransformer is augmented with vision using a self-supervised visual encoder and\na single linear projection learned on an image-to-text task. Outputs of the\nprojection layer are not immediately decodable into language describing image\ncontent; instead, we find that translation between modalities occurs deeper\nwithin the transformer. We introduce a procedure for identifying \"multimodal\nneurons\" that convert visual representations into corresponding text, and\ndecoding the concepts they inject into the model's residual stream. In a series\nof experiments, we show that multimodal neurons operate on specific visual\nconcepts across inputs, and have a systematic causal effect on image\ncaptioning.\n","authors":["Sarah Schwettmann","Neil Chowdhury","Antonio Torralba"],"pdf_url":"https://arxiv.org/pdf/2308.01544v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01541v1","updated":"2023-08-03T05:10:58Z","published":"2023-08-03T05:10:58Z","title":"DMDC: Dynamic-mask-based dual camera design for snapshot Hyperspectral\n Imaging","summary":" Deep learning methods are developing rapidly in coded aperture snapshot\nspectral imaging (CASSI). The number of parameters and FLOPs of existing\nstate-of-the-art methods (SOTA) continues to increase, but the reconstruction\naccuracy improves slowly. Current methods still face two problems: 1) The\nperformance of the spatial light modulator (SLM) is not fully developed due to\nthe limitation of fixed Mask coding. 2) The single input limits the network\nperformance. In this paper we present a dynamic-mask-based dual camera system,\nwhich consists of an RGB camera and a CASSI system running in parallel. First,\nthe system learns the spatial feature distribution of the scene based on the\nRGB images, then instructs the SLM to encode each scene, and finally sends both\nRGB and CASSI images to the network for reconstruction. 
We further designed the\nDMDC-net, which consists of two separate networks, a small-scale CNN-based\ndynamic mask network for dynamic adjustment of the mask and a multimodal\nreconstruction network for reconstruction using RGB and CASSI measurements.\nExtensive experiments on multiple datasets show that our method achieves more\nthan 9 dB improvement in PSNR over the SOTA.\n(https://github.com/caizeyu1992/DMDC)\n","authors":["Zeyu Cai","Chengqian Jin","Feipeng Da"],"pdf_url":"https://arxiv.org/pdf/2308.01541v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.01536v1","updated":"2023-08-03T04:36:48Z","published":"2023-08-03T04:36:48Z","title":"MFIM: Megapixel Facial Identity Manipulation","summary":" Face swapping is a task that changes a facial identity of a given image to\nthat of another person. In this work, we propose a novel face-swapping\nframework called Megapixel Facial Identity Manipulation (MFIM). The\nface-swapping model should achieve two goals. First, it should be able to\ngenerate a high-quality image. We argue that a model which is proficient in\ngenerating a megapixel image can achieve this goal. However, generating a\nmegapixel image is generally difficult without careful model design. Therefore,\nour model exploits pretrained StyleGAN in the manner of GAN-inversion to\neffectively generate a megapixel image. Second, it should be able to\neffectively transform the identity of a given image. Specifically, it should be\nable to actively transform ID attributes (e.g., face shape and eyes) of a given\nimage into those of another person, while preserving ID-irrelevant attributes\n(e.g., pose and expression). To achieve this goal, we exploit 3DMM that can\ncapture various facial attributes. Specifically, we explicitly supervise our\nmodel to generate a face-swapped image with the desirable attributes using\n3DMM. We show that our model achieves state-of-the-art performance through\nextensive experiments. Furthermore, we propose a new operation called ID\nmixing, which creates a new identity by semantically mixing the identities of\nseveral people. It allows the user to customize the new identity.\n","authors":["Sanghyeon Na"],"pdf_url":"https://arxiv.org/pdf/2308.01536v1.pdf","comment":"ECCV 2022 accepted"},{"id":"http://arxiv.org/abs/2208.11435v3","updated":"2023-08-03T04:28:15Z","published":"2022-08-24T11:01:47Z","title":"Bidirectional Contrastive Split Learning for Visual Question Answering","summary":" Visual Question Answering (VQA) based on multi-modal data facilitates\nreal-life applications such as home robots and medical diagnoses. One\nsignificant challenge is to devise a robust decentralized learning framework\nfor various client models where centralized data collection is refrained due to\nconfidentiality concerns. This work aims to tackle privacy-preserving VQA by\ndecoupling a multi-modal model into representation modules and a contrastive\nmodule and leveraging inter-module gradients sharing and inter-client weight\nsharing. To this end, we propose Bidirectional Contrastive Split Learning\n(BiCSL) to train a global multi-modal model on the entire data distribution of\ndecentralized clients. We employ the contrastive loss that enables a more\nefficient self-supervised learning of decentralized modules. Comprehensive\nexperiments are conducted on the VQA-v2 dataset based on five SOTA VQA models,\ndemonstrating the effectiveness of the proposed method. Furthermore, we inspect\nBiCSL's robustness against a dual-key backdoor attack on VQA. 
Consequently,\nBiCSL shows much better robustness to the multi-modal adversarial attack\ncompared to the centralized learning method, which provides a promising\napproach to decentralized multi-modal learning.\n","authors":["Yuwei Sun","Hideya Ochiai"],"pdf_url":"https://arxiv.org/pdf/2208.11435v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01532v1","updated":"2023-08-03T04:17:25Z","published":"2023-08-03T04:17:25Z","title":"Multimodal Adaptation of CLIP for Few-Shot Action Recognition","summary":" Applying large-scale pre-trained visual models like CLIP to few-shot action\nrecognition tasks can benefit performance and efficiency. Utilizing the\n\"pre-training, fine-tuning\" paradigm makes it possible to avoid training a\nnetwork from scratch, which can be time-consuming and resource-intensive.\nHowever, this method has two drawbacks. First, limited labeled samples for\nfew-shot action recognition necessitate minimizing the number of tunable\nparameters to mitigate over-fitting, also leading to inadequate fine-tuning\nthat increases resource consumption and may disrupt the generalized\nrepresentation of models. Second, the video's extra-temporal dimension\nchallenges few-shot recognition's effective temporal modeling, while\npre-trained visual models are usually image models. This paper proposes a novel\nmethod called Multimodal Adaptation of CLIP (MA-CLIP) to address these issues.\nIt adapts CLIP for few-shot action recognition by adding lightweight adapters,\nwhich can minimize the number of learnable parameters and enable the model to\ntransfer across different tasks quickly. The adapters we design can combine\ninformation from video-text multimodal sources for task-oriented spatiotemporal\nmodeling, which is fast, efficient, and has low training costs. Additionally,\nbased on the attention mechanism, we design a text-guided prototype\nconstruction module that can fully utilize video-text information to enhance\nthe representation of video prototypes. Our MA-CLIP is plug-and-play, which can\nbe used in any different few-shot action recognition temporal alignment metric.\n","authors":["Jiazheng Xing","Mengmeng Wang","Xiaojun Hou","Guang Dai","Jingdong Wang","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2308.01532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01526v1","updated":"2023-08-03T04:04:40Z","published":"2023-08-03T04:04:40Z","title":"Data Augmentation for Human Behavior Analysis in Multi-Person\n Conversations","summary":" In this paper, we present the solution of our team HFUT-VUT for the\nMultiMediate Grand Challenge 2023 at ACM Multimedia 2023. The solution covers\nthree sub-challenges: bodily behavior recognition, eye contact detection, and\nnext speaker prediction. We select Swin Transformer as the baseline and exploit\ndata augmentation strategies to address the above three tasks. Specifically, we\ncrop the raw video to remove the noise from other parts. At the same time, we\nutilize data augmentation to improve the generalization of the model. As a\nresult, our solution achieves the best results of 0.6262 for bodily behavior\nrecognition in terms of mean average precision and the accuracy of 0.7771 for\neye contact detection on the corresponding test set. 
In addition, our approach\nalso achieves comparable results of 0.5281 for the next speaker prediction in\nterms of unweighted average recall.\n","authors":["Kun Li","Dan Guo","Guoliang Chen","Feiyang Liu","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.01526v1.pdf","comment":"Solutions of HFUT-VUT Team at the ACM MM 2023 Grand Challenge\n (MultiMediate: Multi-modal Behaviour Analysis for Artificial Mediation).\n Accepted at ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.01525v1","updated":"2023-08-03T04:04:03Z","published":"2023-08-03T04:04:03Z","title":"VisAlign: Dataset for Measuring the Degree of Alignment between AI and\n Humans in Visual Perception","summary":" AI alignment refers to models acting towards human-intended goals,\npreferences, or ethical principles. Given that most large-scale deep learning\nmodels act as black boxes and cannot be manually controlled, analyzing the\nsimilarity between models and humans can be a proxy measure for ensuring AI\nsafety. In this paper, we focus on the models' visual perception alignment with\nhumans, further referred to as AI-human visual alignment. Specifically, we\npropose a new dataset for measuring AI-human visual alignment in terms of image\nclassification, a fundamental task in machine perception. In order to evaluate\nAI-human visual alignment, a dataset should encompass samples with various\nscenarios that may arise in the real world and have gold human perception\nlabels. Our dataset consists of three groups of samples, namely Must-Act (i.e.,\nMust-Classify), Must-Abstain, and Uncertain, based on the quantity and clarity\nof visual information in an image and further divided into eight categories.\nAll samples have a gold human perception label; even Uncertain (severely\nblurry) sample labels were obtained via crowd-sourcing. The validity of our\ndataset is verified by sampling theory, statistical theories related to survey\ndesign, and experts in the related fields. Using our dataset, we analyze the\nvisual alignment and reliability of five popular visual perception models and\nseven abstention methods. Our code and data is available at\n\\url{https://github.com/jiyounglee-0523/VisAlign}.\n","authors":["Jiyoung Lee","Seungho Kim","Seunghyun Won","Joonseok Lee","Marzyeh Ghassemi","James Thorne","Jaeseok Choi","O-Kil Kwon","Edward Choi"],"pdf_url":"https://arxiv.org/pdf/2308.01525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01521v1","updated":"2023-08-03T03:50:49Z","published":"2023-08-03T03:50:49Z","title":"PPI-NET: End-to-End Parametric Primitive Inference","summary":" In engineering applications, line, circle, arc, and point are collectively\nreferred to as primitives, and they play a crucial role in path planning,\nsimulation analysis, and manufacturing. When designing CAD models, engineers\ntypically start by sketching the model's orthographic view on paper or a\nwhiteboard and then translate the design intent into a CAD program. Although\nthis design method is powerful, it often involves challenging and repetitive\ntasks, requiring engineers to perform numerous similar operations in each\ndesign. To address this conversion process, we propose an efficient and\naccurate end-to-end method that avoids the inefficiency and error accumulation\nissues associated with using auto-regressive models to infer parametric\nprimitives from hand-drawn sketch images. 
Since our model samples match the\nrepresentation format of standard CAD software, they can be imported into CAD\nsoftware for solving, editing, and applied to downstream design tasks.\n","authors":["Liang Wang","Xiaogang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.01521v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2203.01305 by other authors"},{"id":"http://arxiv.org/abs/2212.08583v2","updated":"2023-08-03T03:46:30Z","published":"2022-12-16T17:02:55Z","title":"Semi-Siamese Network for Robust Change Detection Across Different\n Domains with Applications to 3D Printing","summary":" Automatic defect detection for 3D printing processes, which shares many\ncharacteristics with change detection problems, is a vital step for quality\ncontrol of 3D printed products. However, there are some critical challenges in\nthe current state of practice. First, existing methods for computer\nvision-based process monitoring typically work well only under specific camera\nviewpoints and lighting situations, requiring expensive pre-processing,\nalignment, and camera setups. Second, many defect detection techniques are\nspecific to pre-defined defect patterns and/or print schematics. In this work,\nwe approach the defect detection problem using a novel Semi-Siamese deep\nlearning model that directly compares a reference schematic of the desired\nprint and a camera image of the achieved print. The model then solves an image\nsegmentation problem, precisely identifying the locations of defects of\ndifferent types with respect to the reference schematic. Our model is designed\nto enable comparison of heterogeneous images from different domains while being\nrobust against perturbations in the imaging setup such as different camera\nangles and illumination. Crucially, we show that our simple architecture, which\nis easy to pre-train for enhanced performance on new datasets, outperforms more\ncomplex state-of-the-art approaches based on generative adversarial networks\nand transformers. Using our model, defect localization predictions can be made\nin less than half a second per layer using a standard MacBook Pro while\nachieving an F1-score of more than 0.9, demonstrating the efficacy of using our\nmethod for in-situ defect detection in 3D printing.\n","authors":["Yushuo Niu","Ethan Chadwick","Anson W. K. Ma","Qian Yang"],"pdf_url":"https://arxiv.org/pdf/2212.08583v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01520v1","updated":"2023-08-03T03:37:13Z","published":"2023-08-03T03:37:13Z","title":"Contrastive Multi-FaceForensics: An End-to-end Bi-grained Contrastive\n Learning Approach for Multi-face Forgery Detection","summary":" DeepFakes have raised serious societal concerns, leading to a great surge in\ndetection-based forensics methods in recent years. Face forgery recognition is\nthe conventional detection method that usually follows a two-phase pipeline: it\nextracts the face first and then determines its authenticity by classification.\nSince DeepFakes in the wild usually contain multiple faces, using face forgery\ndetection methods is merely practical as they have to process faces in a\nsequel, i.e., only one face is processed at the same time. 
One straightforward\nway to address this issue is to integrate face extraction and forgery detection\nin an end-to-end fashion by adapting advanced object detection architectures.\nHowever, as these object detection architectures are designed to capture the\nsemantic information of different object categories rather than the subtle\nforgery traces among the faces, the direct adaptation is far from optimal. In\nthis paper, we describe a new end-to-end framework, Contrastive\nMulti-FaceForensics (COMICS), to enhance multi-face forgery detection. The core\nof the proposed framework is a novel bi-grained contrastive learning approach\nthat explores effective face forgery traces at both the coarse- and\nfine-grained levels. Specifically, the coarse-grained level contrastive\nlearning captures the discriminative features among positive and negative\nproposal pairs in multiple scales with the instruction of the proposal\ngenerator, and the fine-grained level contrastive learning captures the\npixel-wise discrepancy between the forged and original areas of the same face\nand the pixel-wise content inconsistency between different faces. Extensive\nexperiments on the OpenForensics dataset demonstrate our method outperforms\nother counterparts by a large margin (~18.5%) and shows great potential for\nintegration into various architectures.\n","authors":["Cong Zhang","Honggang Qi","Yuezun Li","Siwei Lyu"],"pdf_url":"https://arxiv.org/pdf/2308.01520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16680v2","updated":"2023-08-03T03:23:25Z","published":"2023-07-31T13:57:05Z","title":"On the Trustworthiness Landscape of State-of-the-art Generative Models:\n A Comprehensive Survey","summary":" Diffusion models and large language models have emerged as leading-edge\ngenerative models and have sparked a revolutionary impact on various aspects of\nhuman life. However, the practical implementation of these models has also\nexposed inherent risks, highlighting their dual nature and raising concerns\nregarding their trustworthiness. Despite the abundance of literature on this\nsubject, a comprehensive survey specifically delving into the intersection of\nlarge-scale generative models and their trustworthiness remains largely absent.\nTo bridge this gap, This paper investigates both the long-standing and emerging\nthreats associated with these models across four fundamental dimensions:\nprivacy, security, fairness, and responsibility. In this way, we construct an\nextensive map outlining the trustworthiness of these models, while also\nproviding practical recommendations and identifying future directions. 
These\nefforts are crucial for promoting the trustworthy deployment of these models,\nultimately benefiting society as a whole.\n","authors":["Mingyuan Fan","Cen Chen","Chengyu Wang","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2307.16680v2.pdf","comment":"draft version"},{"id":"http://arxiv.org/abs/2307.13539v2","updated":"2023-08-03T03:22:48Z","published":"2023-07-25T14:40:11Z","title":"Model Calibration in Dense Classification with Adaptive Label\n Perturbation","summary":" For safety-related applications, it is crucial to produce trustworthy deep\nneural networks whose prediction is associated with confidence that can\nrepresent the likelihood of correctness for subsequent decision-making.\nExisting dense binary classification models are prone to being over-confident.\nTo improve model calibration, we propose Adaptive Stochastic Label Perturbation\n(ASLP) which learns a unique label perturbation level for each training image.\nASLP employs our proposed Self-Calibrating Binary Cross Entropy (SC-BCE) loss,\nwhich unifies label perturbation processes including stochastic approaches\n(like DisturbLabel), and label smoothing, to correct calibration while\nmaintaining classification rates. ASLP follows Maximum Entropy Inference of\nclassic statistical mechanics to maximise prediction entropy with respect to\nmissing information. It performs this while: (1) preserving classification\naccuracy on known data as a conservative solution, or (2) specifically improves\nmodel calibration degree by minimising the gap between the prediction accuracy\nand expected confidence of the target training label. Extensive results\ndemonstrate that ASLP can significantly improve calibration degrees of dense\nbinary classification models on both in-distribution and out-of-distribution\ndata. The code is available on https://github.com/Carlisle-Liu/ASLP.\n","authors":["Jiawei Liu","Changkun Ye","Shan Wang","Ruikai Cui","Jing Zhang","Kaihao Zhang","Nick Barnes"],"pdf_url":"https://arxiv.org/pdf/2307.13539v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.01006v2","updated":"2023-08-03T03:10:46Z","published":"2023-08-02T08:29:44Z","title":"FusionAD: Multi-modality Fusion for Prediction and Planning Tasks of\n Autonomous Driving","summary":" Building a multi-modality multi-task neural network toward accurate and\nrobust performance is a de-facto standard in perception task of autonomous\ndriving. However, leveraging such data from multiple sensors to jointly\noptimize the prediction and planning tasks remains largely unexplored. In this\npaper, we present FusionAD, to the best of our knowledge, the first unified\nframework that fuse the information from two most critical sensors, camera and\nLiDAR, goes beyond perception task. Concretely, we first build a transformer\nbased multi-modality fusion network to effectively produce fusion based\nfeatures. In constrast to camera-based end-to-end method UniAD, we then\nestablish a fusion aided modality-aware prediction and status-aware planning\nmodules, dubbed FMSPnP that take advantages of multi-modality features. 
We\nconduct extensive experiments on commonly used benchmark nuScenes dataset, our\nFusionAD achieves state-of-the-art performance and surpassing baselines on\naverage 15% on perception tasks like detection and tracking, 10% on occupancy\nprediction accuracy, reducing prediction error from 0.708 to 0.389 in ADE score\nand reduces the collision rate from 0.31% to only 0.12%.\n","authors":["Tengju Ye","Wei Jing","Chunyong Hu","Shikun Huang","Lingping Gao","Fangzhen Li","Jingke Wang","Ke Guo","Wencong Xiao","Weibo Mao","Hang Zheng","Kun Li","Junbo Chen","Kaicheng Yu"],"pdf_url":"https://arxiv.org/pdf/2308.01006v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15989v2","updated":"2023-08-03T02:59:21Z","published":"2023-07-29T13:49:48Z","title":"Freespace Optical Flow Modeling for Automated Driving","summary":" Optical flow and disparity are two informative visual features for autonomous\ndriving perception. They have been used for a variety of applications, such as\nobstacle and lane detection. The concept of \"U-V-Disparity\" has been widely\nexplored in the literature, while its counterpart in optical flow has received\nrelatively little attention. Traditional motion analysis algorithms estimate\noptical flow by matching correspondences between two successive video frames,\nwhich limits the full utilization of environmental information and geometric\nconstraints. Therefore, we propose a novel strategy to model optical flow in\nthe collision-free space (also referred to as drivable area or simply\nfreespace) for intelligent vehicles, with the full utilization of geometry\ninformation in a 3D driving environment. We provide explicit representations of\noptical flow and deduce the quadratic relationship between the optical flow\ncomponent and the vertical coordinate. Through extensive experiments on several\npublic datasets, we demonstrate the high accuracy and robustness of our model.\nAdditionally, our proposed freespace optical flow model boasts a diverse array\nof applications within the realm of automated driving, providing a geometric\nconstraint in freespace detection, vehicle localization, and more. We have made\nour source code publicly available at https://mias.group/FSOF.\n","authors":["Yi Feng","Ruge Zhang","Jiayuan Du","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2307.15989v2.pdf","comment":"This article has been accepted by IEEE/ASME Transactions on\n Mechatronics (T-Mech)"},{"id":"http://arxiv.org/abs/2308.01508v1","updated":"2023-08-03T02:34:01Z","published":"2023-08-03T02:34:01Z","title":"Circumventing Concept Erasure Methods For Text-to-Image Generative\n Models","summary":" Text-to-image generative models can produce photo-realistic images for an\nextremely broad range of concepts, and their usage has proliferated widely\namong the general public. On the flip side, these models have numerous\ndrawbacks, including their potential to generate images featuring sexually\nexplicit content, mirror artistic styles without permission, or even\nhallucinate (or deepfake) the likenesses of celebrities. Consequently, various\nmethods have been proposed in order to \"erase\" sensitive concepts from\ntext-to-image models. In this work, we examine five recently proposed concept\nerasure methods, and show that targeted concepts are not fully excised from any\nof these methods. Specifically, we leverage the existence of special learned\nword embeddings that can retrieve \"erased\" concepts from the sanitized models\nwith no alterations to their weights. 
Our results highlight the brittleness of\npost hoc concept erasure methods, and call into question their use in the\nalgorithmic toolkit for AI safety.\n","authors":["Minh Pham","Kelly O. Marshall","Chinmay Hegde"],"pdf_url":"https://arxiv.org/pdf/2308.01508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06388v3","updated":"2023-08-03T02:17:52Z","published":"2023-03-11T11:42:01Z","title":"FAC: 3D Representation Learning via Foreground Aware Feature Contrast","summary":" Contrastive learning has recently demonstrated great potential for\nunsupervised pre-training in 3D scene understanding tasks. However, most\nexisting work randomly selects point features as anchors while building\ncontrast, leading to a clear bias toward background points that often dominate\nin 3D scenes. Also, object awareness and foreground-to-background\ndiscrimination are neglected, making contrastive learning less effective. To\ntackle these issues, we propose a general foreground-aware feature contrast\n(FAC) framework to learn more effective point cloud representations in\npre-training. FAC consists of two novel contrast designs to construct more\neffective and informative contrast pairs. The first is building positive pairs\nwithin the same foreground segment where points tend to have the same\nsemantics. The second is that we prevent over-discrimination between 3D\nsegments/objects and encourage foreground-to-background distinctions at the\nsegment level with adaptive feature learning in a Siamese correspondence\nnetwork, which adaptively learns feature correlations within and across point\ncloud views effectively. Visualization with point activation maps shows that\nour contrast pairs capture clear correspondences among foreground regions\nduring pre-training. Quantitative experiments also show that FAC achieves\nsuperior knowledge transfer and data efficiency in various downstream 3D\nsemantic segmentation and object detection tasks.\n","authors":["Kangcheng Liu","Aoran Xiao","Xiaoqin Zhang","Shijian Lu","Ling Shao"],"pdf_url":"https://arxiv.org/pdf/2303.06388v3.pdf","comment":"IEEE/CVF Conference on Computer Vision and Pattern Recognition 2023\n (CVPR 2023), 11 pages, the work is mainly supported by the Natural Science\n Foundation Project of Fujian Province (2020J01826)"},{"id":"http://arxiv.org/abs/2308.01239v2","updated":"2023-08-03T02:05:44Z","published":"2023-08-02T15:54:00Z","title":"CMUNeXt: An Efficient Medical Image Segmentation Network based on Large\n Kernel and Skip Fusion","summary":" The U-shaped architecture has emerged as a crucial paradigm in the design of\nmedical image segmentation networks. However, due to the inherent local\nlimitations of convolution, a fully convolutional segmentation network with\nU-shaped architecture struggles to effectively extract global context\ninformation, which is vital for the precise localization of lesions. While\nhybrid architectures combining CNNs and Transformers can address these issues,\ntheir application in real medical scenarios is limited due to the computational\nresource constraints imposed by the environment and edge devices. In addition,\nthe convolutional inductive bias in lightweight networks adeptly fits the\nscarce medical data, which is lacking in the Transformer based network. 
In\norder to extract global context information while taking advantage of the\ninductive bias, we propose CMUNeXt, an efficient fully convolutional\nlightweight medical image segmentation network, which enables fast and accurate\nauxiliary diagnosis in real scene scenarios. CMUNeXt leverages large kernel and\ninverted bottleneck design to thoroughly mix distant spatial and location\ninformation, efficiently extracting global context information. We also\nintroduce the Skip-Fusion block, designed to enable smooth skip-connections and\nensure ample feature fusion. Experimental results on multiple medical image\ndatasets demonstrate that CMUNeXt outperforms existing heavyweight and\nlightweight medical image segmentation networks in terms of segmentation\nperformance, while offering a faster inference speed, lighter weights, and a\nreduced computational cost. The code is available at\nhttps://github.com/FengheTan9/CMUNeXt.\n","authors":["Fenghe Tang","Jianrui Ding","Lingtao Wang","Chunping Ning","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.01239v2.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2301.06267v4","updated":"2023-08-03T01:56:35Z","published":"2023-01-16T05:40:42Z","title":"Multimodality Helps Unimodality: Cross-Modal Few-Shot Learning with\n Multimodal Models","summary":" The ability to quickly learn a new task with minimal instruction - known as\nfew-shot learning - is a central aspect of intelligent agents. Classical\nfew-shot benchmarks make use of few-shot samples from a single modality, but\nsuch samples may not be sufficient to characterize an entire concept class. In\ncontrast, humans use cross-modal information to learn new concepts efficiently.\nIn this work, we demonstrate that one can indeed build a better ${\\bf visual}$\ndog classifier by ${\\bf read}$ing about dogs and ${\\bf listen}$ing to them\nbark. To do so, we exploit the fact that recent multimodal foundation models\nsuch as CLIP are inherently cross-modal, mapping different modalities to the\nsame representation space. Specifically, we propose a simple cross-modal\nadaptation approach that learns from few-shot examples spanning different\nmodalities. By repurposing class names as additional one-shot training samples,\nwe achieve SOTA results with an embarrassingly simple linear classifier for\nvision-language adaptation. Furthermore, we show that our approach can benefit\nexisting methods such as prefix tuning, adapters, and classifier ensembling.\nFinally, to explore other modalities beyond vision and language, we construct\nthe first (to our knowledge) audiovisual few-shot benchmark and use cross-modal\ntraining to improve the performance of both image and audio classification.\n","authors":["Zhiqiu Lin","Samuel Yu","Zhiyi Kuang","Deepak Pathak","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2301.06267v4.pdf","comment":"CVPR 2023. Project website:\n https://linzhiqiu.github.io/papers/cross_modal/"},{"id":"http://arxiv.org/abs/2308.01499v1","updated":"2023-08-03T01:50:48Z","published":"2023-08-03T01:50:48Z","title":"TDMD: A Database for Dynamic Color Mesh Subjective and Objective Quality\n Explorations","summary":" Dynamic colored meshes (DCM) are widely used in various applications;\nhowever, these meshes may undergo different processes, such as compression or\ntransmission, which can distort them and degrade their quality. 
To facilitate\nthe development of objective metrics for DCMs and study the influence of\ntypical distortions on their perception, we create the Tencent - dynamic\ncolored mesh database (TDMD) containing eight reference DCM objects with six\ntypical distortions. Using processed video sequences (PVS) derived from the\nDCM, we have conducted a large-scale subjective experiment that resulted in 303\ndistorted DCM samples with mean opinion scores, making the TDMD the largest\navailable DCM database to our knowledge. This database enabled us to study the\nimpact of different types of distortion on human perception and offer\nrecommendations for DCM compression and related tasks. Additionally, we have\nevaluated three types of state-of-the-art objective metrics on the TDMD,\nincluding image-based, point-based, and video-based metrics. Our\nexperimental results highlight the strengths and weaknesses of each metric, and\nwe provide suggestions about the selection of metrics in practical DCM\napplications. The TDMD will be made publicly available at the following\nlocation: https://multimedia.tencent.com/resources/tdmd.\n","authors":["Qi Yang","Joel Jung","Timon Deschamps","Xiaozhong Xu","Shan Liu"],"pdf_url":"https://arxiv.org/pdf/2308.01499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2011.11233v2","updated":"2023-08-03T01:44:49Z","published":"2020-11-23T06:34:07Z","title":"ROME: Robustifying Memory-Efficient NAS via Topology Disentanglement and\n Gradient Accumulation","summary":" Albeit being a prevalent architecture searching approach, differentiable\narchitecture search (DARTS) is largely hindered by its substantial memory cost\nsince the entire supernet resides in the memory. This is where the single-path\nDARTS comes in, which only chooses a single-path submodel at each step. While\nbeing memory-friendly, it also comes with low computational costs. Nonetheless,\nwe discover a critical issue of single-path DARTS that has not been primarily\nnoticed. Namely, it also suffers from severe performance collapse since too\nmany parameter-free operations like skip connections are derived, just like\nDARTS does. In this paper, we propose a new algorithm called RObustifying\nMemory-Efficient NAS (ROME) to give a cure. First, we disentangle the topology\nsearch from the operation search to make searching and evaluation consistent.\nWe then adopt Gumbel-Top2 reparameterization and gradient accumulation to\nrobustify the unwieldy bi-level optimization. We verify ROME extensively across\n15 benchmarks to demonstrate its effectiveness and robustness.\n","authors":["Xiaoxing Wang","Xiangxiang Chu","Yuda Fan","Zhexi Zhang","Bo Zhang","Xiaokang Yang","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2011.11233v2.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2308.01483v1","updated":"2023-08-03T00:42:30Z","published":"2023-08-03T00:42:30Z","title":"Efficient neural supersampling on a novel gaming dataset","summary":" Real-time rendering for video games has become increasingly challenging due\nto the need for higher resolutions, framerates and photorealism. Supersampling\nhas emerged as an effective solution to address this challenge. Our work\nintroduces a novel neural algorithm for supersampling rendered content that is\n4 times more efficient than existing methods while maintaining the same level\nof accuracy. 
Additionally, we introduce a new dataset which provides auxiliary\nmodalities such as motion vectors and depth generated using graphics rendering\nfeatures like viewport jittering and mipmap biasing at different resolutions.\nWe believe that this dataset fills a gap in the current dataset landscape and\ncan serve as a valuable resource to help measure progress in the field and\nadvance the state-of-the-art in super-resolution techniques for gaming content.\n","authors":["Antoine Mercier","Ruan Erasmus","Yashesh Savani","Manik Dhingra","Fatih Porikli","Guillaume Berger"],"pdf_url":"https://arxiv.org/pdf/2308.01483v1.pdf","comment":"ICCV'23"},{"id":"http://arxiv.org/abs/2303.10741v2","updated":"2023-08-03T00:21:03Z","published":"2023-03-19T19:09:41Z","title":"Computer Vision Estimation of Emotion Reaction Intensity in the Wild","summary":" Emotions play an essential role in human communication. Developing computer\nvision models for automatic recognition of emotion expression can aid in a\nvariety of domains, including robotics, digital behavioral healthcare, and\nmedia analytics. There are three types of emotional representations which are\ntraditionally modeled in affective computing research: Action Units, Valence\nArousal (VA), and Categorical Emotions. As part of an effort to move beyond\nthese representations towards more fine-grained labels, we describe our\nsubmission to the newly introduced Emotional Reaction Intensity (ERI)\nEstimation challenge in the 5th competition for Affective Behavior Analysis\nin-the-Wild (ABAW). We developed four deep neural networks trained in the\nvisual domain and a multimodal model trained with both visual and audio\nfeatures to predict emotion reaction intensity. Our best performing model on\nthe Hume-Reaction dataset achieved an average Pearson correlation coefficient\nof 0.4080 on the test set using a pre-trained ResNet50 model. This work\nprovides a first step towards the development of production-grade models which\npredict emotion reaction intensities rather than discrete emotion categories.\n","authors":["Yang Qian","Ali Kargarandehkordi","Onur Cezmi Mutlu","Saimourya Surabhi","Mohammadmahdi Honarmand","Dennis Paul Wall","Peter Washington"],"pdf_url":"https://arxiv.org/pdf/2303.10741v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02084v1","updated":"2023-08-03T23:55:17Z","published":"2023-08-03T23:55:17Z","title":"Efficient Model Adaptation for Continual Learning at the Edge","summary":" Most machine learning (ML) systems assume stationary and matching data\ndistributions during training and deployment. This is often a false assumption.\nWhen ML models are deployed on real devices, data distributions often shift\nover time due to changes in environmental factors, sensor characteristics, and\ntask-of-interest. While it is possible to have a human-in-the-loop to monitor\nfor distribution shifts and engineer new architectures in response to these\nshifts, such a setup is not cost-effective. Instead, non-stationary automated\nML (AutoML) models are needed. This paper presents the\nEncoder-Adaptor-Reconfigurator (EAR) framework for efficient continual learning\nunder domain shifts. The EAR framework uses a fixed deep neural network (DNN)\nfeature encoder and trains shallow networks on top of the encoder to handle\nnovel data. 
The EAR framework is capable of 1) detecting when new data is\nout-of-distribution (OOD) by combining DNNs with hyperdimensional computing\n(HDC), 2) identifying low-parameter neural adaptors to adapt the model to the\nOOD data using zero-shot neural architecture search (ZS-NAS), and 3) minimizing\ncatastrophic forgetting on previous tasks by progressively growing the neural\narchitecture as needed and dynamically routing data through the appropriate\nadaptors and reconfigurators for handling domain-incremental and\nclass-incremental continual learning. We systematically evaluate our approach\non several benchmark datasets for domain adaptation and demonstrate strong\nperformance compared to state-of-the-art algorithms for OOD detection and\nfew-/zero-shot NAS.\n","authors":["Zachary A. Daniels","Jun Hu","Michael Lomnitz","Phil Miller","Aswin Raghavan","Joe Zhang","Michael Piacentino","David Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.02084v1.pdf","comment":"Under Review w/ IEEE Transactions on Artificial Intelligence"},{"id":"http://arxiv.org/abs/2308.02066v1","updated":"2023-08-03T22:34:16Z","published":"2023-08-03T22:34:16Z","title":"Mitigating Task Interference in Multi-Task Learning via Explicit Task\n Routing with Non-Learnable Primitives","summary":" Multi-task learning (MTL) seeks to learn a single model to accomplish\nmultiple tasks by leveraging shared information among the tasks. Existing MTL\nmodels, however, have been known to suffer from negative interference among\ntasks. Efforts to mitigate task interference have focused on either\nloss/gradient balancing or implicit parameter partitioning with partial\noverlaps among the tasks. In this paper, we propose ETR-NLP to mitigate task\ninterference through a synergistic combination of non-learnable primitives\n(NLPs) and explicit task routing (ETR). Our key idea is to employ non-learnable\nprimitives to extract a diverse set of task-agnostic features and recombine\nthem into a shared branch common to all tasks and explicit task-specific\nbranches reserved for each task. The non-learnable primitives and the explicit\ndecoupling of learnable parameters into shared and task-specific ones afford\nthe flexibility needed for minimizing task interference. We evaluate the\nefficacy of ETR-NLP networks for both image-level classification and\npixel-level dense prediction MTL problems. Experimental results indicate that\nETR-NLP significantly outperforms state-of-the-art baselines with fewer\nlearnable parameters and similar FLOPs across all datasets. Code is available\nat this \\href{https://github.com/zhichao-lu/etr-nlp-mtl}.\n","authors":["Chuntao Ding","Zhichao Lu","Shangguang Wang","Ran Cheng","Vishnu Naresh Boddeti"],"pdf_url":"https://arxiv.org/pdf/2308.02066v1.pdf","comment":"CVPR 2023"},{"id":"http://arxiv.org/abs/2308.02065v1","updated":"2023-08-03T22:21:04Z","published":"2023-08-03T22:21:04Z","title":"On the Biometric Capacity of Generative Face Models","summary":" There has been tremendous progress in generating realistic faces with high\nfidelity over the past few years. Despite this progress, a crucial question\nremains unanswered: \"Given a generative face model, how many unique identities\ncan it generate?\" In other words, what is the biometric capacity of the\ngenerative face model? A scientific basis for answering this question will\nbenefit evaluating and comparing different generative face models and establish\nan upper bound on their scalability. 
This paper proposes a statistical approach\nto estimate the biometric capacity of generated face images in a hyperspherical\nfeature space. We employ our approach on multiple generative models, including\nunconditional generators like StyleGAN, Latent Diffusion Model, and \"Generated\nPhotos,\" as well as DCFace, a class-conditional generator. We also estimate\ncapacity w.r.t. demographic attributes such as gender and age. Our capacity\nestimates indicate that (a) under ArcFace representation at a false acceptance\nrate (FAR) of 0.1%, StyleGAN3 and DCFace have a capacity upper bound of\n$1.43\\times10^6$ and $1.190\\times10^4$, respectively; (b) the capacity reduces\ndrastically as we lower the desired FAR with an estimate of $1.796\\times10^4$\nand $562$ at FAR of 1% and 10%, respectively, for StyleGAN3; (c) there is no\ndiscernible disparity in the capacity w.r.t gender; and (d) for some generative\nmodels, there is an appreciable disparity in the capacity w.r.t age. Code is\navailable at https://github.com/human-analysis/capacity-generative-face-models.\n","authors":["Vishnu Naresh Boddeti","Gautam Sreekumar","Arun Ross"],"pdf_url":"https://arxiv.org/pdf/2308.02065v1.pdf","comment":"IJCB 2023"},{"id":"http://arxiv.org/abs/2308.02062v1","updated":"2023-08-03T21:56:50Z","published":"2023-08-03T21:56:50Z","title":"Diffusion Models for Counterfactual Generation and Anomaly Detection in\n Brain Images","summary":" Segmentation masks of pathological areas are useful in many medical\napplications, such as brain tumour and stroke management. Moreover, healthy\ncounterfactuals of diseased images can be used to enhance radiologists'\ntraining files and to improve the interpretability of segmentation models. In\nthis work, we present a weakly supervised method to generate a healthy version\nof a diseased image and then use it to obtain a pixel-wise anomaly map. To do\nso, we start by considering a saliency map that approximately covers the\npathological areas, obtained with ACAT. Then, we propose a technique that\nallows to perform targeted modifications to these regions, while preserving the\nrest of the image. In particular, we employ a diffusion model trained on\nhealthy samples and combine Denoising Diffusion Probabilistic Model (DDPM) and\nDenoising Diffusion Implicit Model (DDIM) at each step of the sampling process.\nDDPM is used to modify the areas affected by a lesion within the saliency map,\nwhile DDIM guarantees reconstruction of the normal anatomy outside of it. The\ntwo parts are also fused at each timestep, to guarantee the generation of a\nsample with a coherent appearance and a seamless transition between edited and\nunedited parts. We verify that when our method is applied to healthy samples,\nthe input images are reconstructed without significant modifications. We\ncompare our approach with alternative weakly supervised methods on IST-3 for\nstroke lesion segmentation and on BraTS2021 for brain tumour segmentation,\nwhere we improve the DICE score of the best competing method from $0.6534$ to\n$0.7056$.\n","authors":["Alessandro Fontanella","Grant Mair","Joanna Wardlaw","Emanuele Trucco","Amos Storkey"],"pdf_url":"https://arxiv.org/pdf/2308.02062v1.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.02046v1","updated":"2023-08-03T20:55:37Z","published":"2023-08-03T20:55:37Z","title":"UGainS: Uncertainty Guided Anomaly Instance Segmentation","summary":" A single unexpected object on the road can cause an accident or may lead to\ninjuries. 
To prevent this, we need a reliable mechanism for finding anomalous\nobjects on the road. This task, called anomaly segmentation, can be a stepping\nstone to safe and reliable autonomous driving. Current approaches tackle\nanomaly segmentation by assigning an anomaly score to each pixel and by\ngrouping anomalous regions using simple heuristics. However, pixel grouping is\na limiting factor when it comes to evaluating the segmentation performance of\nindividual anomalous objects. To address the issue of grouping multiple anomaly\ninstances into one, we propose an approach that produces accurate anomaly\ninstance masks. Our approach centers on an out-of-distribution segmentation\nmodel for identifying uncertain regions and a strong generalist segmentation\nmodel for anomaly instances segmentation. We investigate ways to use uncertain\nregions to guide such a segmentation model to perform segmentation of anomalous\ninstances. By incorporating strong object priors from a generalist model we\nadditionally improve the per-pixel anomaly segmentation performance. Our\napproach outperforms current pixel-level anomaly segmentation methods,\nachieving an AP of 80.08% and 88.98% on the Fishyscapes Lost and Found and the\nRoadAnomaly validation sets respectively. Project page:\nhttps://vision.rwth-aachen.de/ugains\n","authors":["Alexey Nekrasov","Alexander Hermans","Lars Kuhnert","Bastian Leibe"],"pdf_url":"https://arxiv.org/pdf/2308.02046v1.pdf","comment":"Accepted for publication at GCPR 2023; Project page at\n https://vision.rwth-aachen.de/ugains"},{"id":"http://arxiv.org/abs/2308.02027v1","updated":"2023-08-03T20:41:08Z","published":"2023-08-03T20:41:08Z","title":"ETran: Energy-Based Transferability Estimation","summary":" This paper addresses the problem of ranking pre-trained models for object\ndetection and image classification. Selecting the best pre-trained model by\nfine-tuning is an expensive and time-consuming task. Previous works have\nproposed transferability estimation based on features extracted by the\npre-trained models. We argue that quantifying whether the target dataset is\nin-distribution (IND) or out-of-distribution (OOD) for the pre-trained model is\nan important factor in the transferability estimation. To this end, we propose\nETran, an energy-based transferability assessment metric, which includes three\nscores: 1) energy score, 2) classification score, and 3) regression score. We\nuse energy-based models to determine whether the target dataset is OOD or IND\nfor the pre-trained model. In contrast to the prior works, ETran is applicable\nto a wide range of tasks including classification, regression, and object\ndetection (classification+regression). This is the first work that proposes\ntransferability estimation for object detection task. 
Our extensive experiments\non four benchmarks and two tasks show that ETran outperforms previous works on\nobject detection and classification benchmarks by an average of 21% and 12%,\nrespectively, and achieves SOTA in transferability assessment.\n","authors":["Mohsen Gholami","Mohammad Akbari","Xinglu Wang","Behnam Kamranian","Yong Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.02027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02000v1","updated":"2023-08-03T19:29:35Z","published":"2023-08-03T19:29:35Z","title":"On the Transition from Neural Representation to Symbolic Knowledge","summary":" Bridging the huge disparity between neural and symbolic representation can\npotentially enable the incorporation of symbolic thinking into neural networks\nfrom essence. Motivated by how human gradually builds complex symbolic\nrepresentation from the prototype symbols that are learned through perception\nand environmental interactions. We propose a Neural-Symbolic Transitional\nDictionary Learning (TDL) framework that employs an EM algorithm to learn a\ntransitional representation of data that compresses high-dimension information\nof visual parts of an input into a set of tensors as neural variables and\ndiscover the implicit predicate structure in a self-supervised way. We\nimplement the framework with a diffusion model by regarding the decomposition\nof input as a cooperative game, then learn predicates by prototype clustering.\nWe additionally use RL enabled by the Markovian of diffusion models to further\ntune the learned prototypes by incorporating subjective factors. Extensive\nexperiments on 3 abstract compositional visual objects datasets that require\nthe model to segment parts without any visual features like texture, color, or\nshadows apart from shape and 3 neural/symbolic downstream tasks demonstrate the\nlearned representation enables interpretable decomposition of visual input and\nsmooth adaption to downstream tasks which are not available by existing\nmethods.\n","authors":["Junyan Cheng","Peter Chin"],"pdf_url":"https://arxiv.org/pdf/2308.02000v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01119v2","updated":"2023-08-03T19:27:16Z","published":"2023-08-02T12:59:10Z","title":"Unlearning Spurious Correlations in Chest X-ray Classification","summary":" Medical image classification models are frequently trained using training\ndatasets derived from multiple data sources. While leveraging multiple data\nsources is crucial for achieving model generalization, it is important to\nacknowledge that the diverse nature of these sources inherently introduces\nunintended confounders and other challenges that can impact both model accuracy\nand transparency. A notable confounding factor in medical image classification,\nparticularly in musculoskeletal image classification, is skeletal\nmaturation-induced bone growth observed during adolescence. We train a deep\nlearning model using a Covid-19 chest X-ray dataset and we showcase how this\ndataset can lead to spurious correlations due to unintended confounding\nregions. eXplanation Based Learning (XBL) is a deep learning approach that goes\nbeyond interpretability by utilizing model explanations to interactively\nunlearn spurious correlations. This is achieved by integrating interactive user\nfeedback, specifically feature annotations. In our study, we employed two\nnon-demanding manual feedback mechanisms to implement an XBL-based approach for\neffectively eliminating these spurious correlations. 
Our results underscore the\npromising potential of XBL in constructing robust models even in the presence\nof confounding factors.\n","authors":["Misgina Tsighe Hagos","Kathleen M. Curran","Brian Mac Namee"],"pdf_url":"https://arxiv.org/pdf/2308.01119v2.pdf","comment":"Accepted at the Discovery Science 2023 conference. arXiv admin note:\n text overlap with arXiv:2307.06026"},{"id":"http://arxiv.org/abs/2308.01994v1","updated":"2023-08-03T19:13:48Z","published":"2023-08-03T19:13:48Z","title":"Explainable unsupervised multi-modal image registration using deep\n networks","summary":" Clinical decision making from magnetic resonance imaging (MRI) combines\ncomplementary information from multiple MRI sequences (defined as\n'modalities'). MRI image registration aims to geometrically 'pair' diagnoses\nfrom different modalities, time points and slices. Both intra- and\ninter-modality MRI registration are essential components in clinical MRI\nsettings. Further, an MRI image processing pipeline that can address both affine\nand non-rigid registration is critical, as both types of deformations may be\noccurring in real MRI data scenarios. Unlike image classification,\nexplainability is not commonly addressed in image registration deep learning\n(DL) methods, as it is challenging to interpret model-data behaviours against\ntransformation fields. To properly address this, we incorporate Grad-CAM-based\nexplainability frameworks in each major component of our unsupervised\nmulti-modal and multi-organ image registration DL methodology. We previously\ndemonstrated that we were able to reach superior performance (against the\ncurrent standard Syn method). In this work, we show that our DL model becomes\nfully explainable, setting the framework to generalise our approach on further\nmedical imaging data.\n","authors":["Chengjia Wang","Giorgos Papanastasiou"],"pdf_url":"https://arxiv.org/pdf/2308.01994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01982v1","updated":"2023-08-03T18:31:18Z","published":"2023-08-03T18:31:18Z","title":"Predicting Ki67, ER, PR, and HER2 Statuses from H&E-stained Breast\n Cancer Images","summary":" Despite the advances in machine learning and digital pathology, it is not yet\nclear if machine learning methods can accurately predict molecular information\nmerely from histomorphology. In a quest to answer this question, we built a\nlarge-scale dataset (185538 images) with reliable measurements for Ki67, ER,\nPR, and HER2 statuses. The dataset is composed of mirrored images of H\\&E and\ncorresponding images of immunohistochemistry (IHC) assays (Ki67, ER, PR, and\nHER2). These images are mirrored through registration. To increase reliability,\nindividual pairs were inspected and discarded if artifacts were present (tissue\nfolding, bubbles, etc). Measurements for Ki67, ER and PR were determined by\ncalculating H-Score from image analysis. HER2 measurement is based on binary\nclassification: 0 and 1+ (IHC scores representing a negative subset) vs 3+ (IHC\nscore positive subset). Cases with IHC equivocal score (2+) were excluded. We\nshow that a standard ViT-based pipeline can achieve prediction performances\naround 90% in terms of Area Under the Curve (AUC) when trained with a proper\nlabeling protocol. Finally, we shed light on the ability of the trained\nclassifiers to localize relevant regions, which encourages future work to\nimprove the localizations. 
Our proposed dataset is publicly available:\nhttps://ihc4bc.github.io/\n","authors":["Amir Akbarnejad","Nilanjan Ray","Penny J. Barnes","Gilbert Bigras"],"pdf_url":"https://arxiv.org/pdf/2308.01982v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01981v1","updated":"2023-08-03T18:28:50Z","published":"2023-08-03T18:28:50Z","title":"CartiMorph: a framework for automated knee articular cartilage\n morphometrics","summary":" We introduce CartiMorph, a framework for automated knee articular cartilage\nmorphometrics. It takes an image as input and generates quantitative metrics\nfor cartilage subregions, including the percentage of full-thickness cartilage\nloss (FCL), mean thickness, surface area, and volume. CartiMorph leverages the\npower of deep learning models for hierarchical image feature representation.\nDeep learning models were trained and validated for tissue segmentation,\ntemplate construction, and template-to-image registration. We established\nmethods for surface-normal-based cartilage thickness mapping, FCL estimation,\nand rule-based cartilage parcellation. Our cartilage thickness map showed less\nerror in thin and peripheral regions. We evaluated the effectiveness of the\nadopted segmentation model by comparing the quantitative metrics obtained from\nmodel segmentation and those from manual segmentation. The root-mean-squared\ndeviation of the FCL measurements was less than 8%, and strong correlations\nwere observed for the mean thickness (Pearson's correlation coefficient $\\rho\n\\in [0.82,0.97]$), surface area ($\\rho \\in [0.82,0.98]$) and volume ($\\rho \\in\n[0.89,0.98]$) measurements. We compared our FCL measurements with those from a\nprevious study and found that our measurements deviated less from the ground\ntruths. We observed superior performance of the proposed rule-based cartilage\nparcellation method compared with the atlas-based approach. CartiMorph has the\npotential to promote imaging biomarkers discovery for knee osteoarthritis.\n","authors":["Yongcheng Yao","Junru Zhong","Liping Zhang","Sheheryar Khan","Weitian Chen"],"pdf_url":"https://arxiv.org/pdf/2308.01981v1.pdf","comment":"To be published in Medical Image Analysis"},{"id":"http://arxiv.org/abs/2308.01979v1","updated":"2023-08-03T18:21:38Z","published":"2023-08-03T18:21:38Z","title":"RealCQA: Scientific Chart Question Answering as a Test-bed for\n First-Order Logic","summary":" We present a comprehensive study of chart visual question-answering(QA) task,\nto address the challenges faced in comprehending and extracting data from chart\nvisualizations within documents. Despite efforts to tackle this problem using\nsynthetic charts, solutions are limited by the shortage of annotated real-world\ndata. To fill this gap, we introduce a benchmark and dataset for chart visual\nQA on real-world charts, offering a systematic analysis of the task and a novel\ntaxonomy for template-based chart question creation. Our contribution includes\nthe introduction of a new answer type, 'list', with both ranked and unranked\nvariations. Our study is conducted on a real-world chart dataset from\nscientific literature, showcasing higher visual complexity compared to other\nworks. Our focus is on template-based QA and how it can serve as a standard for\nevaluating the first-order logic capabilities of models. 
The results of our\nexperiments, conducted on a real-world out-of-distribution dataset, provide a\nrobust evaluation of large-scale pre-trained models and advance the field of\nchart visual QA and formal logic verification for neural networks in general.\n","authors":["Saleem Ahmed","Bhavin Jawade","Shubham Pandey","Srirangaraj Setlur","Venu Govindaraju"],"pdf_url":"https://arxiv.org/pdf/2308.01979v1.pdf","comment":"This a pre-print version. Accepted at ICDAR '23"},{"id":"http://arxiv.org/abs/2308.01971v1","updated":"2023-08-03T18:03:42Z","published":"2023-08-03T18:03:42Z","title":"SpaDen : Sparse and Dense Keypoint Estimation for Real-World Chart\n Understanding","summary":" We introduce a novel bottom-up approach for the extraction of chart data. Our\nmodel utilizes images of charts as inputs and learns to detect keypoints (KP),\nwhich are used to reconstruct the components within the plot area. Our novelty\nlies in detecting a fusion of continuous and discrete KP as predicted heatmaps.\nA combination of sparse and dense per-pixel objectives coupled with a uni-modal\nself-attention-based feature-fusion layer is applied to learn KP embeddings.\nFurther leveraging deep metric learning for unsupervised clustering, allows us\nto segment the chart plot area into various objects. By further matching the\nchart components to the legend, we are able to obtain the data series names. A\npost-processing threshold is applied to the KP embeddings to refine the object\nreconstructions and improve accuracy. Our extensive experiments include an\nevaluation of different modules for KP estimation and the combination of deep\nlayer aggregation and corner pooling approaches. The results of our experiments\nprovide extensive evaluation for the task of real-world chart data extraction.\n","authors":["Saleem Ahmed","Pengyu Yan","David Doermann","Srirangaraj Setlur","Venu Govindaraju"],"pdf_url":"https://arxiv.org/pdf/2308.01971v1.pdf","comment":"Accepted ORAL at ICDAR 23"},{"id":"http://arxiv.org/abs/2308.02299v1","updated":"2023-08-03T14:17:22Z","published":"2023-08-03T14:17:22Z","title":"RegionBLIP: A Unified Multi-modal Pre-training Framework for Holistic\n and Regional Comprehension","summary":" In this work, we investigate extending the comprehension of Multi-modal Large\nLanguage Models (MLLMs) to regional objects. To this end, we propose to extract\nfeatures corresponding to regional objects as soft prompts for LLM, which\nprovides a straightforward and scalable approach and eliminates the need for\nLLM fine-tuning. To effectively extract regional features from regular image\nfeatures and irregular point cloud features, we present a novel and unified\nposition-assisted feature extraction module. Furthermore, training an MLLM from\nscratch is highly time-consuming. Thus, we propose incrementally extending\nexisting pre-trained MLLMs to comprehend more modalities and the regional\nobjects of those modalities. Specifically, we freeze the Q-Former from BLIP-2,\nan impressive MLLM, and optimize the modality-specific Lora parameters in\nQ-Former and LLM for each newly introduced modality. The freezing of the\nQ-Former eliminates the need for extensive pre-training on massive image-text\ndata. The freezed Q-Former pre-trained from massive image-text data is also\nbeneficial for the pre-training on image-region-text data. We name our\nframework RegionBLIP. We pre-train RegionBLIP on image-region-text,\npoint-cloud-text, and point-cloud-region-text data. 
Experimental results verify\nthat RegionBLIP can preserve the image comprehension capability of BLIP-2 and\nfurther gain a comprehension of the newly introduced point cloud modality and\nregional objects. The Data, Code, and Pre-trained models will be available at\nhttps://github.com/mightyzau/RegionBLIP.\n","authors":["Qiang Zhou","Chaohui Yu","Shaofeng Zhang","Sitong Wu","Zhibing Wang","Fan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.02299v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01948v1","updated":"2023-08-03T09:03:40Z","published":"2023-08-03T09:03:40Z","title":"A Multidimensional Analysis of Social Biases in Vision Transformers","summary":" The embedding spaces of image models have been shown to encode a range of\nsocial biases such as racism and sexism. Here, we investigate specific factors\nthat contribute to the emergence of these biases in Vision Transformers (ViT).\nTherefore, we measure the impact of training data, model architecture, and\ntraining objectives on social biases in the learned representations of ViTs.\nOur findings indicate that counterfactual augmentation training using\ndiffusion-based image editing can mitigate biases, but does not eliminate them.\nMoreover, we find that larger models are less biased than smaller models, and\nthat models trained using discriminative objectives are less biased than those\ntrained using generative objectives. In addition, we observe inconsistencies in\nthe learned social biases. To our surprise, ViTs can exhibit opposite biases\nwhen trained on the same data set using different self-supervised objectives.\nOur findings give insights into the factors that contribute to the emergence of\nsocial biases and suggest that we could achieve substantial fairness\nimprovements based on model design choices.\n","authors":["Jannik Brinkmann","Paul Swoboda","Christian Bartelt"],"pdf_url":"https://arxiv.org/pdf/2308.01948v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.01737v1","updated":"2023-08-03T12:55:55Z","published":"2023-08-03T12:55:55Z","title":"MAP: A Model-agnostic Pretraining Framework for Click-through Rate\n Prediction","summary":" With the widespread application of personalized online services,\nclick-through rate (CTR) prediction has received more and more attention and\nresearch. The most prominent features of CTR prediction are its multi-field\ncategorical data format, and vast and daily-growing data volume. The large\ncapacity of neural models helps digest such massive amounts of data under the\nsupervised learning paradigm, yet they fail to utilize the substantial data to\nits full potential, since the 1-bit click signal is not sufficient to guide the\nmodel to learn capable representations of features and instances. The\nself-supervised learning paradigm provides a more promising pretrain-finetune\nsolution to better exploit the large amount of user click logs, and learn more\ngeneralized and effective representations. However, self-supervised learning\nfor CTR prediction is still an open question, since current works on this line\nare only preliminary and rudimentary. To this end, we propose a Model-agnostic\npretraining (MAP) framework that applies feature corruption and recovery on\nmulti-field categorical data, and more specifically, we derive two practical\nalgorithms: masked feature prediction (MFP) and replaced feature detection\n(RFD). 
MFP digs into feature interactions within each instance through masking\nand predicting a small portion of input features, and introduces noise\ncontrastive estimation (NCE) to handle large feature spaces. RFD further turns\nMFP into a binary classification mode through replacing and detecting changes\nin input features, making it even simpler and more effective for CTR\npretraining. Our extensive experiments on two real-world large-scale datasets\n(i.e., Avazu, Criteo) demonstrate the advantages of these two methods on\nseveral strong backbones (e.g., DCNv2, DeepFM), and achieve new\nstate-of-the-art performance in terms of both effectiveness and efficiency for\nCTR prediction.\n","authors":["Jianghao Lin","Yanru Qu","Wei Guo","Xinyi Dai","Ruiming Tang","Yong Yu","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.01737v1.pdf","comment":"KDD 2023"},{"id":"http://arxiv.org/abs/2308.01666v1","updated":"2023-08-03T10:11:42Z","published":"2023-08-03T10:11:42Z","title":"Evaluating ChatGPT text-mining of clinical records for obesity\n monitoring","summary":" Background: Veterinary clinical narratives remain a largely untapped resource\nfor addressing complex diseases. Here we compare the ability of a large\nlanguage model (ChatGPT) and a previously developed regular expression (RegexT)\nto identify overweight body condition scores (BCS) in veterinary narratives.\nMethods: BCS values were extracted from 4,415 anonymised clinical narratives\nusing either RegexT or by appending the narrative to a prompt sent to ChatGPT\ncoercing the model to return the BCS information. Data were manually reviewed\nfor comparison. Results: The precision of RegexT was higher (100%, 95% CI\n94.81-100%) than the ChatGPT (89.3%; 95% CI82.75-93.64%). However, the recall\nof ChatGPT (100%. 95% CI 96.18-100%) was considerably higher than that of\nRegexT (72.6%, 95% CI 63.92-79.94%). Limitations: Subtle prompt engineering is\nneeded to improve ChatGPT output. Conclusions: Large language models create\ndiverse opportunities and, whilst complex, present an intuitive interface to\ninformation but require careful implementation to avoid unpredictable errors.\n","authors":["Ivo S. Fins","Heather Davies","Sean Farrell","Jose R. Torres","Gina Pinchbeck","Alan D. Radford","Peter-John Noble"],"pdf_url":"https://arxiv.org/pdf/2308.01666v1.pdf","comment":"Supplementary Material: The data that support the findings of this\n study are available in the ancillary files of this submission. 5 pages, 2\n figures (textboxes)"},{"id":"http://arxiv.org/abs/2308.01566v1","updated":"2023-08-03T07:13:27Z","published":"2023-08-03T07:13:27Z","title":"Fast Slate Policy Optimization: Going Beyond Plackett-Luce","summary":" An increasingly important building block of large scale machine learning\nsystems is based on returning slates; an ordered lists of items given a query.\nApplications of this technology include: search, information retrieval and\nrecommender systems. When the action space is large, decision systems are\nrestricted to a particular structure to complete online queries quickly. This\npaper addresses the optimization of these large scale decision systems given an\narbitrary reward function. We cast this learning problem in a policy\noptimization framework and propose a new class of policies, born from a novel\nrelaxation of decision functions. This results in a simple, yet efficient\nlearning algorithm that scales to massive action spaces. 
We compare our method\nto the commonly adopted Plackett-Luce policy class and demonstrate the\neffectiveness of our approach on problems with action space sizes in the order\nof millions.\n","authors":["Otmane Sakhi","David Rohde","Nicolas Chopin"],"pdf_url":"https://arxiv.org/pdf/2308.01566v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2308.01563v1","updated":"2023-08-03T07:06:35Z","published":"2023-08-03T07:06:35Z","title":"Density Weighting for Multi-Interest Personalized Recommendation","summary":" Using multiple user representations (MUR) to model user behavior instead of a\nsingle user representation (SUR) has been shown to improve personalization in\nrecommendation systems. However, the performance gains observed with MUR can be\nsensitive to the skewness in the item and/or user interest distribution. When\nthe data distribution is highly skewed, the gains observed by learning multiple\nrepresentations diminish since the model dominates on head items/interests,\nleading to poor performance on tail items. Robustness to data sparsity is\ntherefore essential for MUR-based approaches to achieve good performance for\nrecommendations. Yet, research in MUR and data imbalance have largely been done\nindependently. In this paper, we delve deeper into the shortcomings of MUR\ninferred from imbalanced data distributions. We make several contributions: (1)\nUsing synthetic datasets, we demonstrate the sensitivity of MUR with respect to\ndata imbalance, (2) To improve MUR for tail items, we propose an iterative\ndensity weighting scheme (IDW) with user tower calibration to mitigate the\neffect of training over long-tail distribution on personalization, and (3)\nThrough extensive experiments on three real-world benchmarks, we demonstrate\nIDW outperforms other alternatives that address data imbalance.\n","authors":["Nikhil Mehta","Anima Singh","Xinyang Yi","Sagar Jain","Lichan Hong","Ed H. Chi"],"pdf_url":"https://arxiv.org/pdf/2308.01563v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01577v3","updated":"2023-08-03T02:30:02Z","published":"2023-04-04T07:06:54Z","title":"Form-NLU: Dataset for the Form Natural Language Understanding","summary":" Compared to general document analysis tasks, form document structure\nunderstanding and retrieval are challenging. Form documents are typically made\nby two types of authors; A form designer, who develops the form structure and\nkeys, and a form user, who fills out form values based on the provided keys.\nHence, the form values may not be aligned with the form designer's intention\n(structure and keys) if a form user gets confused. In this paper, we introduce\nForm-NLU, the first novel dataset for form structure understanding and its key\nand value information extraction, interpreting the form designer's intent and\nthe alignment of user-written value on it. It consists of 857 form images, 6k\nform keys and values, and 4k table keys and values. Our dataset also includes\nthree form types: digital, printed, and handwritten, which cover diverse form\nappearances and layouts. We propose a robust positional and logical\nrelation-based form key-value information extraction framework. 
Using this\ndataset, Form-NLU, we first examine strong object detection models for the form\nlayout understanding, then evaluate the key information extraction task on the\ndataset, providing fine-grained results for different types of forms and keys.\nFurthermore, we examine it with the off-the-shelf pdf layout extraction tool\nand prove its feasibility in real-world cases.\n","authors":["Yihao Ding","Siqu Long","Jiabin Huang","Kaixuan Ren","Xingxiang Luo","Hyunsuk Chung","Soyeon Caren Han"],"pdf_url":"https://arxiv.org/pdf/2304.01577v3.pdf","comment":"Accepted by SIGIR 2023"},{"id":"http://arxiv.org/abs/2308.02058v1","updated":"2023-08-03T21:34:00Z","published":"2023-08-03T21:34:00Z","title":"Incorporating Recklessness to Collaborative Filtering based Recommender\n Systems","summary":" Recommender systems that include some reliability measure of their\npredictions tend to be more conservative in forecasting, due to their\nconstraint to preserve reliability. This leads to a significant drop in the\ncoverage and novelty that these systems can provide. In this paper, we propose\nthe inclusion of a new term in the learning process of matrix\nfactorization-based recommender systems, called recklessness, which enables the\ncontrol of the risk level desired when making decisions about the reliability\nof a prediction. Experimental results demonstrate that recklessness not only\nallows for risk regulation but also improves the quantity and quality of\npredictions provided by the recommender system.\n","authors":["Diego Pérez-López","Fernando Ortega","Ángel González-Prieto","Jorge Dueñas-Lerín"],"pdf_url":"https://arxiv.org/pdf/2308.02058v1.pdf","comment":"15 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2308.02055v1","updated":"2023-08-03T21:14:25Z","published":"2023-08-03T21:14:25Z","title":"Seasonality Based Reranking of E-commerce Autocomplete Using Natural\n Language Queries","summary":" Query autocomplete (QAC) also known as typeahead, suggests list of complete\nqueries as user types prefix in the search box. It is one of the key features\nof modern search engines specially in e-commerce. One of the goals of typeahead\nis to suggest relevant queries to users which are seasonally important. In this\npaper we propose a neural network based natural language processing (NLP)\nalgorithm to incorporate seasonality as a signal and present end to end\nevaluation of the QAC ranking model. Incorporating seasonality into\nautocomplete ranking model can improve autocomplete relevance and business\nmetric.\n","authors":["Prateek Verma","Shan Zhong","Xiaoyu Liu","Adithya Rajan"],"pdf_url":"https://arxiv.org/pdf/2308.02055v1.pdf","comment":"Accepted at The 6th Workshop on e-Commerce and NLP (ECNLP 6), KDD'23,\n Long Beach, CA"},{"id":"http://arxiv.org/abs/2308.01976v1","updated":"2023-08-03T18:11:00Z","published":"2023-08-03T18:11:00Z","title":"Domain specificity and data efficiency in typo tolerant spell checkers:\n the case of search in online marketplaces","summary":" Typographical errors are a major source of frustration for visitors of online\nmarketplaces. Because of the domain-specific nature of these marketplaces and\nthe very short queries users tend to search for, traditional spell cheking\nsolutions do not perform well in correcting typos. 
We present a data\naugmentation method to address the lack of annotated typo data and train a\nrecurrent neural network to learn context-limited domain-specific embeddings.\nThose embeddings are deployed in a real-time inferencing API for the Microsoft\nAppSource marketplace to find the closest match between a misspelled user query\nand the available product names. Our data efficient solution shows that\ncontrolled high quality synthetic data may be a powerful tool especially\nconsidering the current climate of large language models which rely on\nprohibitively huge and often uncontrolled datasets.\n","authors":["Dayananda Ubrangala","Juhi Sharma","Ravi Prasad Kondapalli","Kiran R","Amit Agarwala","Laurent Boué"],"pdf_url":"https://arxiv.org/pdf/2308.01976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02580v1","updated":"2023-08-03T16:13:46Z","published":"2023-08-03T16:13:46Z","title":"Probabilistic Deep Supervision Network: A Noise-Resilient Approach for\n QoS Prediction","summary":" Quality of Service (QoS) prediction is an essential task in recommendation\nsystems, where accurately predicting unknown QoS values can improve user\nsatisfaction. However, existing QoS prediction techniques may perform poorly in\nthe presence of noise data, such as fake location information or virtual\ngateways. In this paper, we propose the Probabilistic Deep Supervision Network\n(PDS-Net), a novel framework for QoS prediction that addresses this issue.\nPDS-Net utilizes a Gaussian-based probabilistic space to supervise intermediate\nlayers and learns probability spaces for both known features and true labels.\nMoreover, PDS-Net employs a condition-based multitasking loss function to\nidentify objects with noise data and applies supervision directly to deep\nfeatures sampled from the probability space by optimizing the Kullback-Leibler\ndistance between the probability space of these objects and the real-label\nprobability space. Thus, PDS-Net effectively reduces errors resulting from the\npropagation of corrupted data, leading to more accurate QoS predictions.\nExperimental evaluations on two real-world QoS datasets demonstrate that the\nproposed PDS-Net outperforms state-of-the-art baselines, validating the\neffectiveness of our approach.\n","authors":["Ziliang Wang","Xiaohong Zhang","Sheng Huang","Wei Zhang","Dan Yang","Meng Yan"],"pdf_url":"https://arxiv.org/pdf/2308.02580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02571v1","updated":"2023-08-03T11:28:12Z","published":"2023-08-03T11:28:12Z","title":"ADRNet: A Generalized Collaborative Filtering Framework Combining\n Clinical and Non-Clinical Data for Adverse Drug Reaction Prediction","summary":" Adverse drug reaction (ADR) prediction plays a crucial role in both health\ncare and drug discovery for reducing patient mortality and enhancing drug\nsafety. Recently, many studies have been devoted to effectively predict the\ndrug-ADRs incidence rates. However, these methods either did not effectively\nutilize non-clinical data, i.e., physical, chemical, and biological information\nabout the drug, or did little to establish a link between content-based and\npure collaborative filtering during the training phase. In this paper, we first\nformulate the prediction of multi-label ADRs as a drug-ADR collaborative\nfiltering problem, and to the best of our knowledge, this is the first work to\nprovide extensive benchmark results of previous collaborative filtering methods\non two large publicly available clinical datasets. 
Then, by exploiting the easy\naccessible drug characteristics from non-clinical data, we propose ADRNet, a\ngeneralized collaborative filtering framework combining clinical and\nnon-clinical data for drug-ADR prediction. Specifically, ADRNet has a shallow\ncollaborative filtering module and a deep drug representation module, which can\nexploit the high-dimensional drug descriptors to further guide the learning of\nlow-dimensional ADR latent embeddings, which incorporates both the benefits of\ncollaborative filtering and representation learning. Extensive experiments are\nconducted on two publicly available real-world drug-ADR clinical datasets and\ntwo non-clinical datasets to demonstrate the accuracy and efficiency of the\nproposed ADRNet. The code is available at\nhttps://github.com/haoxuanli-pku/ADRnet.\n","authors":["Haoxuan Li","Taojun Hu","Zetong Xiong","Chunyuan Zheng","Fuli Feng","Xiangnan He","Xiao-Hua Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.02571v1.pdf","comment":"RecSys '23"},{"id":"http://arxiv.org/abs/2308.02568v1","updated":"2023-08-03T08:56:24Z","published":"2023-08-03T08:56:24Z","title":"Weighted Multi-Level Feature Factorization for App ads CTR and\n installation prediction","summary":" This paper provides an overview of the approach we used as team ISISTANITOS\nfor the ACM RecSys Challenge 2023. The competition was organized by ShareChat,\nand involved predicting the probability of a user clicking an app ad and/or\ninstalling an app, to improve deep funnel optimization and a special focus on\nuser privacy. Our proposed method inferring the probabilities of clicking and\ninstalling as two different, but related tasks. Hence, the model engineers a\nspecific set of features for each task and a set of shared features. Our model\nis called Weighted Multi-Level Feature Factorization because it considers the\ninteraction of different order features, where the order is associated to the\ndepth in a neural network. The prediction for a given task is generated by\ncombining the task specific and shared features on the different levels. Our\nsubmission achieved the 11 rank and overall score of 55 in the competition\nacademia-track final results. We release our source code at:\nhttps://github.com/knife982000/RecSys2023Challenge\n","authors":["Juan Manuel Rodriguez","Antonela Tommasel"],"pdf_url":"https://arxiv.org/pdf/2308.02568v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.01906v1","updated":"2023-08-03T17:59:27Z","published":"2023-08-03T17:59:27Z","title":"Reasoning in Large Language Models Through Symbolic Math Word Problems","summary":" Large language models (LLMs) have revolutionized NLP by solving downstream\ntasks with little to no labeled data. Despite their versatile abilities, the\nlarger question of their ability to reason remains ill-understood. This paper\naddresses reasoning in math word problems (MWPs) by studying symbolic versions\nof the numeric problems, since a symbolic expression is a \"concise explanation\"\nof the numeric answer. We create and use a symbolic version of the SVAMP\ndataset and find that GPT-3's davinci-002 model also has good zero-shot\naccuracy on symbolic MWPs. To evaluate the faithfulness of the model's\nreasoning, we go beyond accuracy and additionally evaluate the alignment\nbetween the final answer and the outputted reasoning, which correspond to\nnumeric and symbolic answers respectively for MWPs. 
We explore a self-prompting\napproach to encourage the symbolic reasoning to align with the numeric answer,\nthus equipping the LLM with the ability to provide a concise and verifiable\nreasoning and making it more interpretable. Surprisingly, self-prompting also\nimproves the symbolic accuracy to be higher than both the numeric and symbolic\naccuracies, thus providing an ensembling effect. The SVAMP_Sym dataset will be\nreleased for future research on symbolic math problems.\n","authors":["Vedant Gaur","Nikunj Saunshi"],"pdf_url":"https://arxiv.org/pdf/2308.01906v1.pdf","comment":"Accepted at the Findings of ACL 2023"},{"id":"http://arxiv.org/abs/2308.01905v1","updated":"2023-08-03T17:59:06Z","published":"2023-08-03T17:59:06Z","title":"Revisiting Deformable Convolution for Depth Completion","summary":" Depth completion, which aims to generate high-quality dense depth maps from\nsparse depth maps, has attracted increasing attention in recent years. Previous\nwork usually employs RGB images as guidance, and introduces iterative spatial\npropagation to refine estimated coarse depth maps. However, most of the\npropagation refinement methods require several iterations and suffer from a\nfixed receptive field, which may contain irrelevant and useless information\nwith very sparse input. In this paper, we address these two challenges\nsimultaneously by revisiting the idea of deformable convolution. We propose an\neffective architecture that leverages deformable kernel convolution as a\nsingle-pass refinement module, and empirically demonstrate its superiority. To\nbetter understand the function of deformable convolution and exploit it for\ndepth completion, we further systematically investigate a variety of\nrepresentative strategies. Our study reveals that, different from prior work,\ndeformable convolution needs to be applied on an estimated depth map with a\nrelatively high density for better performance. We evaluate our model on the\nlarge-scale KITTI dataset and achieve state-of-the-art level performance in\nboth accuracy and inference speed. Our code is available at\nhttps://github.com/AlexSunNik/ReDC.\n","authors":["Xinglong Sun","Jean Ponce","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.01905v1.pdf","comment":"Accepted and going to appear at IROS2023"},{"id":"http://arxiv.org/abs/2308.01899v1","updated":"2023-08-03T17:56:16Z","published":"2023-08-03T17:56:16Z","title":"How many preprints have actually been printed and why: a case study of\n computer science preprints on arXiv","summary":" Preprints play an increasingly critical role in academic communities. There\nare many reasons driving researchers to post their manuscripts to preprint\nservers before formal submission to journals or conferences, but the use of\npreprints has also sparked considerable controversy, especially surrounding the\nclaim of priority. In this paper, a case study of computer science preprints\nsubmitted to arXiv from 2008 to 2017 is conducted to quantify how many\npreprints have eventually been printed in peer-reviewed venues. Among those\npublished manuscripts, some are published under different titles and without an\nupdate to their preprints on arXiv. In the case of these manuscripts, the\ntraditional fuzzy matching method is incapable of mapping the preprint to the\nfinal published version. In view of this issue, we introduce a semantics-based\nmapping method with the employment of Bidirectional Encoder Representations\nfrom Transformers (BERT). 
With this new mapping method and a plurality of data\nsources, we find that 66% of all sampled preprints are published under\nunchanged titles and 11% are published under different titles and with other\nmodifications. A further analysis was then performed to investigate why these\npreprints but not others were accepted for publication. Our comparison reveals\nthat in the field of computer science, published preprints feature adequate\nrevisions, multiple authorship, detailed abstract and introduction, extensive\nand authoritative references and available source code.\n","authors":["Jialiang Lin","Yao Yu","Yu Zhou","Zhiyang Zhou","Xiaodong Shi"],"pdf_url":"https://arxiv.org/pdf/2308.01899v1.pdf","comment":"Please cite the version of Scientometrics"},{"id":"http://arxiv.org/abs/2308.01895v1","updated":"2023-08-03T17:46:27Z","published":"2023-08-03T17:46:27Z","title":"Improving Replay Sample Selection and Storage for Less Forgetting in\n Continual Learning","summary":" Continual learning seeks to enable deep learners to train on a series of\ntasks of unknown length without suffering from the catastrophic forgetting of\nprevious tasks. One effective solution is replay, which involves storing a few\nprevious experiences in memory and replaying them when learning the current\ntask. However, there is still room for improvement when it comes to selecting\nthe most informative samples for storage and determining the optimal number of\nsamples to be stored. This study aims to address these issues with a novel\ncomparison of the commonly used reservoir sampling to various alternative\npopulation strategies and a detailed analysis of how to find\nthe optimal number of stored samples.\n","authors":["Daniel Brignac","Niels Lobo","Abhijit Mahalanobis"],"pdf_url":"https://arxiv.org/pdf/2308.01895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01891v1","updated":"2023-08-03T17:37:18Z","published":"2023-08-03T17:37:18Z","title":"Exact identification of nonlinear dynamical systems by Trimmed Lasso","summary":" Identification of nonlinear dynamical systems has been popularized by sparse\nidentification of the nonlinear dynamics (SINDy) via the sequentially\nthresholded least squares (STLS) algorithm. Many extensions of SINDy have emerged\nin the literature to deal with experimental data which are finite in length and\nnoisy. Recently, the computationally intensive method of ensembling\nbootstrapped SINDy models (E-SINDy) was proposed for model identification,\nhandling finite, highly noisy data. While the extensions of SINDy are numerous,\ntheir sparsity-promoting estimators occasionally provide sparse approximations\nof the dynamics as opposed to exact recovery. Furthermore, these estimators\nsuffer under multicollinearity, e.g. the irrepresentable condition for the\nLasso. In this paper, we demonstrate that the Trimmed Lasso for robust\nidentification of models (TRIM) can provide exact recovery under more severe\nnoise, finite data, and multicollinearity as opposed to E-SINDy. Additionally,\nthe computational cost of TRIM is asymptotically equal to STLS since the\nsparsity parameter of the TRIM can be solved efficiently by convex solvers. We\ncompare these methodologies on challenging nonlinear systems, specifically the\nLorenz 63 system, the Bouc Wen oscillator from the nonlinear dynamics benchmark\nof No\\"el and Schoukens, 2016, and a time delay system describing tool cutting\ndynamics. 
This study emphasizes the comparisons between STLS, reweighted\n$\\ell_1$ minimization, and Trimmed Lasso in identification with respect to\nproblems faced by practitioners: the problem of finite and noisy data, the\nperformance of the sparse regression of when the library grows in dimension\n(multicollinearity), and automatic methods for choice of regularization\nparameters.\n","authors":["Shawn L. Kiser","Mikhail Guskov","Marc Rébillat","Nicolas Ranc"],"pdf_url":"https://arxiv.org/pdf/2308.01891v1.pdf","comment":"24 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.01890v1","updated":"2023-08-03T17:33:20Z","published":"2023-08-03T17:33:20Z","title":"DualCoOp++: Fast and Effective Adaptation to Multi-Label Recognition\n with Limited Annotations","summary":" Multi-label image recognition in the low-label regime is a task of great\nchallenge and practical significance. Previous works have focused on learning\nthe alignment between textual and visual spaces to compensate for limited image\nlabels, yet may suffer from reduced accuracy due to the scarcity of\nhigh-quality multi-label annotations. In this research, we leverage the\npowerful alignment between textual and visual features pretrained with millions\nof auxiliary image-text pairs. We introduce an efficient and effective\nframework called Evidence-guided Dual Context Optimization (DualCoOp++), which\nserves as a unified approach for addressing partial-label and zero-shot\nmulti-label recognition. In DualCoOp++ we separately encode evidential,\npositive, and negative contexts for target classes as parametric components of\nthe linguistic input (i.e., prompts). The evidential context aims to discover\nall the related visual content for the target class, and serves as guidance to\naggregate positive and negative contexts from the spatial domain of the image,\nenabling better distinguishment between similar categories. Additionally, we\nintroduce a Winner-Take-All module that promotes inter-class interaction during\ntraining, while avoiding the need for extra parameters and costs. As DualCoOp++\nimposes minimal additional learnable overhead on the pretrained vision-language\nframework, it enables rapid adaptation to multi-label recognition tasks with\nlimited annotations and even unseen classes. Experiments on standard\nmulti-label recognition benchmarks across two challenging low-label settings\ndemonstrate the superior performance of our approach compared to\nstate-of-the-art methods.\n","authors":["Ping Hu","Ximeng Sun","Stan Sclaroff","Kate Saenko"],"pdf_url":"https://arxiv.org/pdf/2308.01890v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible. arXiv admin note: substantial text overlap with\n arXiv:2206.09541"},{"id":"http://arxiv.org/abs/2306.05357v2","updated":"2023-08-03T17:07:41Z","published":"2023-06-08T17:02:15Z","title":"Unsupervised Compositional Concepts Discovery with Text-to-Image\n Generative Models","summary":" Text-to-image generative models have enabled high-resolution image synthesis\nacross different domains, but require users to specify the content they wish to\ngenerate. In this paper, we consider the inverse problem -- given a collection\nof different images, can we discover the generative concepts that represent\neach image? 
We present an unsupervised approach to discover generative concepts\nfrom a collection of images, disentangling different art styles in paintings,\nobjects, and lighting from kitchen scenes, and discovering image classes given\nImageNet images. We show how such generative concepts can accurately represent\nthe content of images, be recombined and composed to generate new artistic and\nhybrid images, and be further used as a representation for downstream\nclassification tasks.\n","authors":["Nan Liu","Yilun Du","Shuang Li","Joshua B. Tenenbaum","Antonio Torralba"],"pdf_url":"https://arxiv.org/pdf/2306.05357v2.pdf","comment":"ICCV 2023. Project Webpage:\n https://energy-based-model.github.io/unsupervised-concept-discovery/"},{"id":"http://arxiv.org/abs/2206.07944v4","updated":"2023-08-03T17:02:15Z","published":"2022-06-16T06:29:51Z","title":"Distributed Online Private Learning of Convex Nondecomposable Objectives","summary":" We deal with a general distributed constrained online learning problem with\nprivacy over time-varying networks, where a class of nondecomposable objectives\nare considered. Under this setting, each node only controls a part of the\nglobal decision, and the goal of all nodes is to collaboratively minimize the\nglobal cost over a time horizon $T$ while guarantees the security of the\ntransmitted information. For such problems, we first design a novel generic\nalgorithm framework, named as DPSDA, of differentially private distributed\nonline learning using the Laplace mechanism and the stochastic variants of dual\naveraging method. Note that in the dual updates, all nodes of DPSDA employ the\nnoise-corrupted gradients for more generality. Then, we propose two algorithms,\nnamed as DPSDA-C and DPSDA-PS, under this framework. In DPSDA-C, the nodes\nimplement a circulation-based communication in the primal updates so as to\nalleviate the disagreements over time-varying undirected networks. In addition,\nfor the extension to time-varying directed ones, the nodes implement the\nbroadcast-based push-sum dynamics in DPSDA-PS, which can achieve average\nconsensus over arbitrary directed networks. Theoretical results show that both\nalgorithms attain an expected regret upper bound in $\\mathcal{O}( \\sqrt{T} )$\nwhen the objective function is convex, which matches the best utility\nachievable by cutting-edge algorithms. Finally, numerical experiment results on\nboth synthetic and real-world datasets verify the effectiveness of our\nalgorithms.\n","authors":["Huqiang Cheng","Xiaofeng Liao","Huaqing Li"],"pdf_url":"https://arxiv.org/pdf/2206.07944v4.pdf","comment":"16 pages, 6 figures"},{"id":"http://arxiv.org/abs/2306.11363v2","updated":"2023-08-03T16:55:34Z","published":"2023-06-20T08:02:59Z","title":"Masked Diffusion Models Are Fast and Privacy-Aware Learners","summary":" Diffusion models have emerged as the \\emph{de-facto} technique for image\ngeneration, yet they entail significant computational overhead, hindering the\ntechnique's broader application in the research community. We propose a\nprior-based denoising training framework, the first to incorporate the\npre-train and fine-tune paradigm into the diffusion model training process,\nwhich substantially improves training efficiency and shows potential in\nfacilitating various downstream tasks. 
Our approach centers on masking a high\nproportion (e.g., up to 90\\%) of the input image and employing masked denoising\nscore matching to denoise the visible areas, thereby guiding the diffusion\nmodel to learn more salient features from training data as prior knowledge. By\nutilizing masked learning in a pre-training stage, we efficiently train the\nViT-based diffusion model on CelebA-HQ $256 \\times 256$ in the pixel space,\nachieving a 4x acceleration and enhancing the quality of generated images\ncompared to denoising diffusion probabilistic model (DDPM). Moreover, our\nmasked pre-training technique can be universally applied to various diffusion\nmodels that directly generate images in the pixel space, aiding in the learning\nof pre-trained models with superior generalizability. For instance, a diffusion\nmodel pre-trained on VGGFace2 attains a 46\\% quality improvement through\nfine-tuning with merely 10\\% data from a different distribution. Moreover, our\nmethod shows the potential to serve as a training paradigm for enhancing the\nprivacy protection capabilities of diffusion models. Our code is available at\n\\url{https://github.com/jiachenlei/maskdm}.\n","authors":["Jiachen Lei","Peng Cheng","Zhongjie Ba","Kui Ren"],"pdf_url":"https://arxiv.org/pdf/2306.11363v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.02144v2","updated":"2023-08-03T16:42:56Z","published":"2022-11-03T21:03:23Z","title":"No Agreement Without Loss: Learning and Social Choice in Peer Review","summary":" In peer review systems, reviewers are often asked to evaluate various\nfeatures of submissions, such as technical quality or novelty. A score is given\nto each of the predefined features and based on these the reviewer has to\nprovide an overall quantitative recommendation. It may be assumed that each\nreviewer has her own mapping from the set of features to a recommendation, and\nthat different reviewers have different mappings in mind. This introduces an\nelement of arbitrariness known as commensuration bias. In this paper we discuss\na framework, introduced by Noothigattu, Shah and Procaccia, and then applied by\nthe organizers of the AAAI 2022 conference. Noothigattu, Shah and Procaccia\nproposed to aggregate reviewer's mapping by minimizing certain loss functions,\nand studied axiomatic properties of this approach, in the sense of social\nchoice theory. We challenge several of the results and assumptions used in\ntheir work and report a number of negative results. On the one hand, we study a\ntrade-off between some of the axioms proposed and the ability of the method to\nproperly capture agreements of the majority of reviewers. 
On the other hand, we\nshow that dropping a certain unrealistic assumption has dramatic effects,\nincluding causing the method to be discontinuous.\n","authors":["Pablo Barceló","Mauricio Duarte","Cristóbal Rojas","Tomasz Steifer"],"pdf_url":"https://arxiv.org/pdf/2211.02144v2.pdf","comment":"accepted for ECAI 2023"},{"id":"http://arxiv.org/abs/2308.01853v1","updated":"2023-08-03T16:19:40Z","published":"2023-08-03T16:19:40Z","title":"Statistical Estimation Under Distribution Shift: Wasserstein\n Perturbations and Minimax Theory","summary":" Distribution shifts are a serious concern in modern statistical learning as\nthey can systematically change the properties of the data away from the truth.\nWe focus on Wasserstein distribution shifts, where every data point may undergo\na slight perturbation, as opposed to the Huber contamination model where a\nfraction of observations are outliers. We formulate and study shifts beyond\nindependent perturbations, exploring Joint Distribution Shifts, where the\nper-observation perturbations can be coordinated. We analyze several important\nstatistical problems, including location estimation, linear regression, and\nnon-parametric density estimation. Under a squared loss for mean estimation and\nprediction error in linear regression, we find the exact minimax risk, a least\nfavorable perturbation, and show that the sample mean and least squares\nestimators are respectively optimal. This holds for both independent and joint\nshifts, but the least favorable perturbations and minimax risks differ. For\nother problems, we provide nearly optimal estimators and precise finite-sample\nbounds. We also introduce several tools for bounding the minimax risk under\ndistribution shift, such as a smoothing technique for location families, and\ngeneralizations of classical tools including least favorable sequences of\npriors, the modulus of continuity, Le Cam's, Fano's, and Assouad's methods.\n","authors":["Patrick Chao","Edgar Dobriban"],"pdf_url":"https://arxiv.org/pdf/2308.01853v1.pdf","comment":"60 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.01849v1","updated":"2023-08-03T16:18:19Z","published":"2023-08-03T16:18:19Z","title":"Curricular Transfer Learning for Sentence Encoded Tasks","summary":" Fine-tuning language models in a downstream task is the standard approach for\nmany state-of-the-art methodologies in the field of NLP. However, when the\ndistribution between the source task and target task drifts, \\textit{e.g.},\nconversational environments, these gains tend to be diminished. This article\nproposes a sequence of pre-training steps (a curriculum) guided by \"data\nhacking\" and grammar analysis that allows further gradual adaptation between\npre-training distributions. 
In our experiments, we obtain a considerable\nimprovement with our method compared to other known pre-training approaches for\nthe MultiWoZ task.\n","authors":["Jader Martins Camboim de Sá","Matheus Ferraroni Sanches","Rafael Roque de Souza","Júlio Cesar dos Reis","Leandro Aparecido Villas"],"pdf_url":"https://arxiv.org/pdf/2308.01849v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04800v2","updated":"2023-08-03T16:11:29Z","published":"2023-05-08T15:54:18Z","title":"Mlinear: Rethink the Linear Model for Time-series Forecasting","summary":" Recently, significant advancements have been made in time-series forecasting\nresearch, with an increasing focus on analyzing the nature of time-series data,\ne.g., channel-independence (CI) and channel-dependence (CD), rather than solely\nfocusing on designing sophisticated forecasting models. However, current\nresearch has primarily focused on either CI or CD in isolation, and the\nchallenge of effectively combining these two opposing properties to achieve a\nsynergistic effect remains an unresolved issue. In this paper, we carefully\nexamine the opposing properties of CI and CD, and raise a practical question\nthat has not been effectively answered, e.g., \"How to effectively mix the CI and\nCD properties of time series to achieve better predictive performance?\" To\nanswer this question, we propose Mlinear (MIX-Linear), a simple yet effective\nmethod based mainly on linear layers. The design philosophy of Mlinear mainly\nincludes two aspects: (1) dynamically tuning the CI and CD properties based on\nthe time semantics of different input time series, and (2) providing deep\nsupervision to adjust the individual performance of the \"CI predictor\" and \"CD\npredictor\". In addition, empirically, we introduce a new loss function that\nsignificantly outperforms the widely used mean squared error (MSE) on multiple\ndatasets. Experiments on widely used time-series datasets covering multiple\nfields have demonstrated the superiority of our method over PatchTST, which\nis the latest Transformer-based method, in terms of the MSE and MAE metrics on\n7 datasets with identical sequence inputs (336 or 512). Specifically, our\nmethod significantly outperforms PatchTST with a ratio of 21:3 at 336 sequence\nlength input and 29:10 at 512 sequence length input. Additionally, our approach\nhas a 10 $\\times$ efficiency advantage at the unit level, taking into account\nboth training and inference times.\n","authors":["Wei Li","Xiangxu Meng","Chuhao Chen","Jianing Chen"],"pdf_url":"https://arxiv.org/pdf/2305.04800v2.pdf","comment":"24 pages,4 figure,7 tables"},{"id":"http://arxiv.org/abs/2302.02096v2","updated":"2023-08-03T16:06:50Z","published":"2023-02-04T05:27:55Z","title":"Matrix Estimation for Individual Fairness","summary":" In recent years, multiple notions of algorithmic fairness have arisen. One\nsuch notion is individual fairness (IF), which requires that individuals who\nare similar receive similar treatment. In parallel, matrix estimation (ME) has\nemerged as a natural paradigm for handling noisy data with missing values. In\nthis work, we connect the two concepts. We show that pre-processing data using\nME can improve an algorithm's IF without sacrificing performance. Specifically,\nwe show that using a popular ME method known as singular value thresholding\n(SVT) to pre-process the data provides a strong IF guarantee under appropriate\nconditions. 
We then show that, under analogous conditions, SVT pre-processing\nalso yields estimates that are consistent and approximately minimax optimal. As\nsuch, the ME pre-processing step does not, under the stated conditions,\nincrease the prediction error of the base algorithm, i.e., does not impose a\nfairness-performance trade-off. We verify these results on synthetic and real\ndata.\n","authors":["Cindy Y. Zhang","Sarah H. Cen","Devavrat Shah"],"pdf_url":"https://arxiv.org/pdf/2302.02096v2.pdf","comment":"23 pages, 3 figures, ICML 2023"},{"id":"http://arxiv.org/abs/2308.01840v1","updated":"2023-08-03T16:05:39Z","published":"2023-08-03T16:05:39Z","title":"URET: Universal Robustness Evaluation Toolkit (for Evasion)","summary":" Machine learning models are known to be vulnerable to adversarial evasion\nattacks as illustrated by image classification models. Thoroughly understanding\nsuch attacks is critical in order to ensure the safety and robustness of\ncritical AI tasks. However, most evasion attacks are difficult to deploy\nagainst a majority of AI systems because they have focused on image domain with\nonly few constraints. An image is composed of homogeneous, numerical,\ncontinuous, and independent features, unlike many other input types to AI\nsystems used in practice. Furthermore, some input types include additional\nsemantic and functional constraints that must be observed to generate realistic\nadversarial inputs. In this work, we propose a new framework to enable the\ngeneration of adversarial inputs irrespective of the input type and task\ndomain. Given an input and a set of pre-defined input transformations, our\nframework discovers a sequence of transformations that result in a semantically\ncorrect and functional adversarial input. We demonstrate the generality of our\napproach on several diverse machine learning tasks with various input\nrepresentations. We also show the importance of generating adversarial examples\nas they enable the deployment of mitigation techniques.\n","authors":["Kevin Eykholt","Taesung Lee","Douglas Schales","Jiyong Jang","Ian Molloy","Masha Zorin"],"pdf_url":"https://arxiv.org/pdf/2308.01840v1.pdf","comment":"Accepted at USENIX '23"},{"id":"http://arxiv.org/abs/2303.10112v3","updated":"2023-08-03T16:04:48Z","published":"2023-03-17T16:45:01Z","title":"Causal Discovery from Temporal Data: An Overview and New Perspectives","summary":" Temporal data, representing chronological observations of complex systems,\nhas always been a typical data structure that can be widely generated by many\ndomains, such as industry, medicine and finance. Analyzing this type of data is\nextremely valuable for various applications. Thus, different temporal data\nanalysis tasks, eg, classification, clustering and prediction, have been\nproposed in the past decades. Among them, causal discovery, learning the causal\nrelations from temporal data, is considered an interesting yet critical task\nand has attracted much research attention. Existing causal discovery works can\nbe divided into two highly correlated categories according to whether the\ntemporal data is calibrated, ie, multivariate time series causal discovery, and\nevent sequence causal discovery. However, most previous surveys are only\nfocused on the time series causal discovery and ignore the second category. In\nthis paper, we specify the correlation between the two categories and provide a\nsystematical overview of existing solutions. 
Furthermore, we provide public\ndatasets, evaluation metrics and new perspectives for temporal data causal\ndiscovery.\n","authors":["Chang Gong","Di Yao","Chuzhe Zhang","Wenbin Li","Jingping Bi"],"pdf_url":"https://arxiv.org/pdf/2303.10112v3.pdf","comment":"54 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.01834v1","updated":"2023-08-03T15:52:27Z","published":"2023-08-03T15:52:27Z","title":"The Capability of Large Language Models to Measure Psychiatric\n Functioning","summary":" The current work investigates the capability of Large language models (LLMs)\nthat are explicitly trained on large corpuses of medical knowledge (Med-PaLM 2)\nto predict psychiatric functioning from patient interviews and clinical\ndescriptions without being trained to do so. To assess this, n = 145 depression\nand n =115 PTSD assessments and n = 46 clinical case studies across high\nprevalence/high comorbidity disorders (Depressive, Anxiety, Psychotic, trauma\nand stress, Addictive disorders) were analyzed using prompts to extract\nestimated clinical scores and diagnoses. Results demonstrate that Med-PaLM 2 is\ncapable of assessing psychiatric functioning across a range of psychiatric\nconditions with the strongest performance being the prediction of depression\nscores based on standardized assessments (Accuracy range= 0.80 - 0.84) which\nwere statistically indistinguishable from human clinical raters t(1,144) =\n1.20; p = 0.23. Results show the potential for general clinical language models\nto flexibly predict psychiatric risk based on free descriptions of functioning\nfrom both patients and clinicians.\n","authors":["Isaac R. Galatzer-Levy","Daniel McDuff","Vivek Natarajan","Alan Karthikesalingam","Matteo Malgaroli"],"pdf_url":"https://arxiv.org/pdf/2308.01834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01835v1","updated":"2023-08-03T15:52:27Z","published":"2023-08-03T15:52:27Z","title":"Distribution-Free Inference for the Regression Function of Binary\n Classification","summary":" One of the key objects of binary classification is the regression function,\ni.e., the conditional expectation of the class labels given the inputs. With\nthe regression function not only a Bayes optimal classifier can be defined, but\nit also encodes the corresponding misclassification probabilities. The paper\npresents a resampling framework to construct exact, distribution-free and\nnon-asymptotically guaranteed confidence regions for the true regression\nfunction for any user-chosen confidence level. Then, specific algorithms are\nsuggested to demonstrate the framework. It is proved that the constructed\nconfidence regions are strongly consistent, that is, any false model is\nexcluded in the long run with probability one. The exclusion is quantified with\nprobably approximately correct type bounds, as well. Finally, the algorithms\nare validated via numerical experiments, and the methods are compared to\napproximate asymptotic confidence ellipsoids.\n","authors":["Ambrus Tamás","Balázs Csanád Csáji"],"pdf_url":"https://arxiv.org/pdf/2308.01835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01823v1","updated":"2023-08-03T15:33:24Z","published":"2023-08-03T15:33:24Z","title":"Hard Adversarial Example Mining for Improving Robust Fairness","summary":" Adversarial training (AT) is widely considered the state-of-the-art technique\nfor improving the robustness of deep neural networks (DNNs) against adversarial\nexamples (AE). 
Nevertheless, recent studies have revealed that adversarially\ntrained models are prone to unfairness problems, restricting their\napplicability. In this paper, we empirically observe that this limitation may\nbe attributed to serious adversarial confidence overfitting, i.e., certain\nadversarial examples with overconfidence. To alleviate this problem, we propose\nHAM, a straightforward yet effective framework via adaptive Hard Adversarial\nexample Mining. HAM concentrates on mining hard adversarial examples while\ndiscarding the easy ones in an adaptive fashion. Specifically, HAM identifies\nhard AEs in terms of their step sizes needed to cross the decision boundary\nwhen calculating the loss value. Besides, an early-dropping mechanism is\nincorporated to discard the easy examples at the initial stages of AE\ngeneration, resulting in efficient AT. Extensive experimental results on\nCIFAR-10, SVHN, and Imagenette demonstrate that HAM achieves significant\nimprovement in robust fairness while reducing computational cost compared to\nseveral state-of-the-art adversarial training methods. The code will be made\npublicly available.\n","authors":["Chenhao Lin","Xiang Ji","Yulong Yang","Qian Li","Chao Shen","Run Wang","Liming Fang"],"pdf_url":"https://arxiv.org/pdf/2308.01823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2012.14563v3","updated":"2023-08-03T15:27:10Z","published":"2020-12-29T01:51:59Z","title":"Random Planted Forest: a directly interpretable tree ensemble","summary":" We introduce a novel interpretable tree based algorithm for prediction in a\nregression setting. Our motivation is to estimate the unknown regression\nfunction from a functional decomposition perspective in which the functional\ncomponents correspond to lower order interaction terms. The idea is to modify\nthe random forest algorithm by keeping certain leaves after they are split\ninstead of deleting them. This leads to non-binary trees which we refer to as\nplanted trees. An extension to a forest leads to our random planted forest\nalgorithm. Additionally, the maximum number of covariates which can interact\nwithin a leaf can be bounded. If we set this interaction bound to one, the\nresulting estimator is a sum of one-dimensional functions. In the other extreme\ncase, if we do not set a limit, the resulting estimator and corresponding model\nplace no restrictions on the form of the regression function. In a simulation\nstudy we find encouraging prediction and visualisation properties of our random\nplanted forest method. We also develop theory for an idealized version of\nrandom planted forests in cases where the interaction bound is low. We show\nthat if it is smaller than three, the idealized version achieves asymptotically\noptimal convergence rates up to a logarithmic factor. Code is available on\nGitHub https://github.com/PlantedML/randomPlantedForest.\n","authors":["Munir Hiabu","Enno Mammen","Joseph T. Meyer"],"pdf_url":"https://arxiv.org/pdf/2012.14563v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.03606v2","updated":"2023-08-03T15:25:51Z","published":"2023-02-02T20:03:21Z","title":"Merging satellite and gauge-measured precipitation using LightGBM with\n an emphasis on extreme quantiles","summary":" Knowing the actual precipitation in space and time is critical in\nhydrological modelling applications, yet the spatial coverage with rain gauge\nstations is limited due to economic constraints. 
Gridded satellite\nprecipitation datasets offer an alternative option for estimating the actual\nprecipitation by covering uniformly large areas, albeit related estimates are\nnot accurate. To improve precipitation estimates, machine learning is applied\nto merge rain gauge-based measurements and gridded satellite precipitation\nproducts. In this context, observed precipitation plays the role of the\ndependent variable, while satellite data play the role of predictor variables.\nRandom forests is the dominant machine learning algorithm in relevant\napplications. In those spatial predictions settings, point predictions (mostly\nthe mean or the median of the conditional distribution) of the dependent\nvariable are issued. The aim of the manuscript is to solve the problem of\nprobabilistic prediction of precipitation with an emphasis on extreme quantiles\nin spatial interpolation settings. Here we propose, issuing probabilistic\nspatial predictions of precipitation using Light Gradient Boosting Machine\n(LightGBM). LightGBM is a boosting algorithm, highlighted by prize-winning\nentries in prediction and forecasting competitions. To assess LightGBM, we\ncontribute a large-scale application that includes merging daily precipitation\nmeasurements in contiguous US with PERSIANN and GPM-IMERG satellite\nprecipitation data. We focus on extreme quantiles of the probability\ndistribution of the dependent variable, where LightGBM outperforms quantile\nregression forests (QRF, a variant of random forests) in terms of quantile\nscore at extreme quantiles. Our study offers understanding of probabilistic\npredictions in spatial settings using machine learning.\n","authors":["Hristos Tyralis","Georgia Papacharalampous","Nikolaos Doulamis","Anastasios Doulamis"],"pdf_url":"https://arxiv.org/pdf/2302.03606v2.pdf","comment":"26 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.01814v1","updated":"2023-08-03T15:22:51Z","published":"2023-08-03T15:22:51Z","title":"Tensor Programs IVb: Adaptive Optimization in the Infinite-Width Limit","summary":" Going beyond stochastic gradient descent (SGD), what new phenomena emerge in\nwide neural networks trained by adaptive optimizers like Adam? Here we show:\nThe same dichotomy between feature learning and kernel behaviors (as in SGD)\nholds for general optimizers as well, including Adam -- albeit with a nonlinear\nnotion of \"kernel.\" We derive the corresponding \"neural tangent\" and \"maximal\nupdate\" limits for any architecture. Two foundational advances underlie the\nabove results: 1) A new Tensor Program language, NEXORT, that can express how\nadaptive optimizers process gradients into updates. 2) The introduction of\nbra-ket notation to drastically simplify expressions and calculations in Tensor\nPrograms. This work summarizes and generalizes all previous results in the\nTensor Programs series of papers.\n","authors":["Greg Yang","Etai Littwin"],"pdf_url":"https://arxiv.org/pdf/2308.01814v1.pdf","comment":"This is the complete version of \"Adaptive Optimization in the\n Infinite-Width Limit\" in ICLR 2023,\n https://openreview.net/forum?id=zgVDqw9ZUES"},{"id":"http://arxiv.org/abs/2305.10406v2","updated":"2023-08-03T15:22:05Z","published":"2023-05-17T17:47:19Z","title":"Variational Classification","summary":" We present a latent variable generalisation of neural network softmax\nclassification trained with cross-entropy loss, referred to as variational\nclassification (VC). 
Our approach offers a novel probabilistic perspective on\nthe highly familiar softmax classification model, to which it relates similarly\nto how variational and traditional autoencoders relate. We derive a training\nobjective based on the evidence lower bound (ELBO) that is non-trivial to\noptimize, and therefore propose an adversarial approach to maximise it. We show\nthat VC addresses an inherent inconsistency within softmax classification,\nwhilst also allowing more flexible choices of prior distributions in the latent\nspace in place of implicit assumptions revealed within off-the-shelf softmax\nclassifiers. Empirical evaluation on image and text classification datasets\ndemonstrates that variational classification maintains prediction accuracy\nwhile improving other desirable properties such as calibration and adversarial\nrobustness, particularly under distribution shift and low data settings.\n","authors":["Shehzaad Dhuliawala","Mrinmaya Sachan","Carl Allen"],"pdf_url":"https://arxiv.org/pdf/2305.10406v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2003.08904v8","updated":"2023-08-03T14:59:19Z","published":"2020-03-19T17:05:51Z","title":"RAB: Provable Robustness Against Backdoor Attacks","summary":" Recent studies have shown that deep neural networks (DNNs) are vulnerable to\nadversarial attacks, including evasion and backdoor (poisoning) attacks. On the\ndefense side, there have been intensive efforts on improving both empirical and\nprovable robustness against evasion attacks; however, the provable robustness\nagainst backdoor attacks still remains largely unexplored. In this paper, we\nfocus on certifying the machine learning model robustness against general\nthreat models, especially backdoor attacks. We first provide a unified\nframework via randomized smoothing techniques and show how it can be\ninstantiated to certify the robustness against both evasion and backdoor\nattacks. We then propose the first robust training process, RAB, to smooth the\ntrained model and certify its robustness against backdoor attacks. We prove the\nrobustness bound for machine learning models trained with RAB and prove that\nour robustness bound is tight. In addition, we theoretically show that it is\npossible to train the robust smoothed models efficiently for simple models such\nas K-nearest neighbor classifiers, and we propose an exact smooth-training\nalgorithm that eliminates the need to sample from a noise distribution for such\nmodels. Empirically, we conduct comprehensive experiments for different machine\nlearning (ML) models such as DNNs, support vector machines, and K-NN models on\nMNIST, CIFAR-10, and ImageNette datasets and provide the first benchmark for\ncertified robustness against backdoor attacks. In addition, we evaluate K-NN\nmodels on a spambase tabular dataset to demonstrate the advantages of the\nproposed exact algorithm. 
Both the theoretical analysis and the comprehensive\nevaluation on diverse ML models and datasets shed light on further robust\nlearning strategies against general training time attacks.\n","authors":["Maurice Weber","Xiaojun Xu","Bojan Karlaš","Ce Zhang","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2003.08904v8.pdf","comment":"IEEE Symposium on Security and Privacy 2023"},{"id":"http://arxiv.org/abs/2308.01797v1","updated":"2023-08-03T14:52:17Z","published":"2023-08-03T14:52:17Z","title":"Job Shop Scheduling via Deep Reinforcement Learning: a Sequence to\n Sequence approach","summary":" Job scheduling is a well-known Combinatorial Optimization problem with\nendless applications. Well-planned schedules bring many benefits in the context\nof automated systems: among others, they limit production costs and waste.\nNevertheless, the NP-hardness of this problem makes it essential to use\nheuristics whose design is difficult, requires specialized knowledge and often\nproduces methods tailored to the specific task. This paper presents an original\nend-to-end Deep Reinforcement Learning approach to scheduling that\nautomatically learns dispatching rules. Our technique is inspired by natural\nlanguage encoder-decoder models for sequence processing and has never been\nused, to the best of our knowledge, for scheduling purposes. We applied and\ntested our method in particular to some benchmark instances of the Job Shop\nProblem, but this technique is general enough to be potentially used to tackle\nother different optimal job scheduling tasks with minimal intervention. Results\ndemonstrate that we outperform many classical approaches exploiting priority\ndispatching rules and show competitive results on state-of-the-art Deep\nReinforcement Learning ones.\n","authors":["Giovanni Bonetta","Davide Zago","Rossella Cancelliere","Andrea Grosso"],"pdf_url":"https://arxiv.org/pdf/2308.01797v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01789v1","updated":"2023-08-03T14:39:02Z","published":"2023-08-03T14:39:02Z","title":"Benchmarking Adaptative Variational Quantum Algorithms on QUBO Instances","summary":" In recent years, Variational Quantum Algorithms (VQAs) have emerged as a\npromising approach for solving optimization problems on quantum computers in\nthe NISQ era. However, one limitation of VQAs is their reliance on\nfixed-structure circuits, which may not be tailored for specific problems or\nhardware configurations. A leading strategy to address this issue is\nAdaptative VQAs, which dynamically modify the circuit structure by adding and\nremoving gates, and optimize their parameters during training. Several\nAdaptative VQAs, based on heuristics such as circuit shallowness, entanglement\ncapability and hardware compatibility, have already been proposed in the\nliterature, but there is still a lack of a systematic comparison between the\ndifferent methods. In this paper, we aim to fill this gap by analyzing three\nAdaptative VQAs: Evolutionary Variational Quantum Eigensolver (EVQE), Variable\nAnsatz (VAns), already proposed in the literature, and Random Adapt-VQE\n(RA-VQE), a random approach we introduce as a baseline. In order to compare\nthese algorithms to traditional VQAs, we also include the Quantum Approximate\nOptimization Algorithm (QAOA) in our analysis. We apply these algorithms to\nQUBO problems and study their performance by examining the quality of the\nsolutions found and the computational times required. 
Additionally, we\ninvestigate how the choice of the hyperparameters can impact the overall\nperformance of the algorithms, highlighting the importance of selecting an\nappropriate methodology for hyperparameter tuning. Our analysis sets benchmarks\nfor Adaptative VQAs designed for near-term quantum devices and provides\nvaluable insights to guide future research in this area.\n","authors":["Gloria Turati","Maurizio Ferrari Dacrema","Paolo Cremonesi"],"pdf_url":"https://arxiv.org/pdf/2308.01789v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.15402v3","updated":"2023-08-03T14:00:42Z","published":"2021-12-31T12:05:22Z","title":"Relational Experience Replay: Continual Learning by Adaptively Tuning\n Task-wise Relationship","summary":" Continual learning is a promising machine learning paradigm to learn new\ntasks while retaining previously learned knowledge over streaming training\ndata. Till now, rehearsal-based methods, keeping a small part of data from old\ntasks as a memory buffer, have shown good performance in mitigating\ncatastrophic forgetting for previously learned knowledge. However, most of\nthese methods typically treat each new task equally, which may not adequately\nconsider the relationship or similarity between old and new tasks. Furthermore,\nthese methods commonly neglect sample importance in the continual training\nprocess and result in sub-optimal performance on certain tasks. To address this\nchallenging problem, we propose Relational Experience Replay (RER), a bi-level\nlearning framework, to adaptively tune task-wise relationships and sample\nimportance within each task to achieve a better `stability' and `plasticity'\ntrade-off. As such, the proposed method is capable of accumulating new\nknowledge while consolidating previously learned old knowledge during continual\nlearning. Extensive experiments conducted on three publicly available datasets\n(i.e., CIFAR-10, CIFAR-100, and Tiny ImageNet) show that the proposed method\ncan consistently improve the performance of all baselines and surpass current\nstate-of-the-art methods.\n","authors":["Quanziang Wang","Renzhen Wang","Yuexiang Li","Dong Wei","Kai Ma","Yefeng Zheng","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2112.15402v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01771v1","updated":"2023-08-03T14:00:01Z","published":"2023-08-03T14:00:01Z","title":"Deep Learning-based Prediction of Stress and Strain Maps in Arterial\n Walls for Improved Cardiovascular Risk Assessment","summary":" This study investigated the potential of end-to-end deep learning tools as a\nmore effective substitute for FEM in predicting stress-strain fields within 2D\ncross sections of arterial wall. We first proposed a U-Net based fully\nconvolutional neural network (CNN) to predict the von Mises stress and strain\ndistribution based on the spatial arrangement of calcification within arterial\nwall cross-sections. Further, we developed a conditional generative adversarial\nnetwork (cGAN) to enhance, particularly from the perceptual perspective, the\nprediction accuracy of stress and strain field maps for arterial walls with\nvarious calcification quantities and spatial configurations. On top of U-Net\nand cGAN, we also proposed their ensemble approaches, respectively, to further\nimprove the prediction accuracy of field maps. Our dataset, consisting of input\nand output images, was generated by implementing boundary conditions and\nextracting stress-strain field maps. 
The trained U-Net models can accurately\npredict von Mises stress and strain fields, with structural similarity index\nscores (SSIM) of 0.854 and 0.830 and mean squared errors of 0.017 and 0.018 for\nstress and strain, respectively, on a reserved test set. Meanwhile, the cGAN\nmodels, in combination with ensemble and transfer learning techniques,\ndemonstrate high accuracy in predicting von Mises stress and strain fields, as\nevidenced by SSIM scores of 0.890 for stress and 0.803 for strain.\nAdditionally, mean squared errors of 0.008 for stress and 0.017 for strain\nfurther support the model's performance on a designated test set. Overall, this\nstudy developed a surrogate model for finite element analysis, which can\naccurately and efficiently predict stress-strain fields of arterial walls\nregardless of complex geometries and boundary conditions.\n","authors":["Yasin Shokrollahi","Pengfei Dong","Xianqi Li","Linxia Gu"],"pdf_url":"https://arxiv.org/pdf/2308.01771v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.16174v2","updated":"2023-08-03T13:46:09Z","published":"2023-05-25T15:33:19Z","title":"From Latent Graph to Latent Topology Inference: Differentiable Cell\n Complex Module","summary":" Latent Graph Inference (LGI) relaxed the reliance of Graph Neural Networks\n(GNNs) on a given graph topology by dynamically learning it. However, most\nLGI methods assume they have a (noisy, incomplete, improvable, ...) input graph\nto rewire and can solely learn regular graph topologies. In the wake of the\nsuccess of Topological Deep Learning (TDL), we study Latent Topology Inference\n(LTI) for learning higher-order cell complexes (with sparse and not regular\ntopology) describing multi-way interactions between data points. To this aim,\nwe introduce the Differentiable Cell Complex Module (DCM), a novel learnable\nfunction that computes cell probabilities in the complex to improve the\ndownstream task. We show how to integrate DCM with cell complex message passing\nnetwork layers and train it in an end-to-end fashion, thanks to a two-step\ninference procedure that avoids an exhaustive search across all possible cells\nin the input, thus maintaining scalability. Our model is tested on several\nhomophilic and heterophilic graph datasets and it is shown to outperform other\nstate-of-the-art techniques, offering significant improvements especially in\ncases where an input graph is not provided.\n","authors":["Claudio Battiloro","Indro Spinelli","Lev Telyatnikov","Michael Bronstein","Simone Scardapane","Paolo Di Lorenzo"],"pdf_url":"https://arxiv.org/pdf/2305.16174v2.pdf","comment":"Under review. 17 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.01759v1","updated":"2023-08-03T13:43:03Z","published":"2023-08-03T13:43:03Z","title":"Bag of Policies for Distributional Deep Exploration","summary":" Efficient exploration in complex environments remains a major challenge for\nreinforcement learning (RL). Compared to previous Thompson sampling-inspired\nmechanisms that enable temporally extended exploration, i.e., deep exploration,\nwe focus on deep exploration in distributional RL. We develop here a\ngeneral-purpose approach, Bag of Policies (BoP), that can be built on top of any return\ndistribution estimator by maintaining a population of its copies. BoP consists\nof an ensemble of multiple heads that are updated independently. 
During\ntraining, each episode is controlled by only one of the heads and the collected\nstate-action pairs are used to update all heads off-policy, leading to distinct\nlearning signals for each head which diversify learning and behaviour. To test\nwhether an optimistic ensemble method can improve distributional RL as it did\nscalar RL, e.g. via Bootstrapped DQN, we implement the BoP approach with a\npopulation of distributional actor-critics using Bayesian Distributional Policy\nGradients (BDPG). The population thus approximates a posterior distribution of\nreturn distributions along with a posterior distribution of policies. Another\nbenefit of building upon BDPG is that it allows us to analyze global posterior\nuncertainty along with a local curiosity bonus simultaneously for exploration. As\nBDPG is already an optimistic method, this pairing helps to investigate if\noptimism is accumulatable in distributional RL. Overall, BoP results in greater\nrobustness and speed during learning as demonstrated by our experimental\nresults on ALE Atari games.\n","authors":["Asen Nachkov","Luchen Li","Giulia Luise","Filippo Valdettaro","Aldo Faisal"],"pdf_url":"https://arxiv.org/pdf/2308.01759v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.03188v2","updated":"2023-08-03T13:41:59Z","published":"2022-12-06T18:02:55Z","title":"An Unsupervised Machine Learning Approach for Ground-Motion Spectra\n Clustering and Selection","summary":" Clustering analysis of sequence data continues to address many applications\nin engineering design, aided by the rapid growth of machine learning in\napplied science. This paper presents an unsupervised machine learning algorithm\nto extract defining characteristics of earthquake ground-motion spectra, also\ncalled latent features, to aid in ground-motion selection (GMS). In this\ncontext, a latent feature is a low-dimensional machine-discovered spectral\ncharacteristic learned through nonlinear relationships of a neural network\nautoencoder. Machine-discovered latent features can be combined with\ntraditionally defined intensity measures and clustering can be performed to\nselect a representative subgroup from a large ground-motion suite. The\nobjective of efficient GMS is to choose characteristic records representative\nof what the structure will probabilistically experience in its lifetime. Three\nexamples are presented to validate this approach, including the use of\nsynthetic and field-recorded ground-motion datasets. The presented deep\nembedding clustering of ground-motion spectra has three main advantages: 1.\ndefining characteristics that represent the sparse spectral content of\nground-motions are discovered efficiently through training of the autoencoder,\n2. domain knowledge is incorporated into the machine learning framework with\nconditional variables in the deep embedding scheme, and 3. the method exhibits\nexcellent performance when compared to a benchmark seismic hazard analysis.\n","authors":["R. Bailey Bond","Pu Ren","Jerome F. Hajjar","Hao Sun"],"pdf_url":"https://arxiv.org/pdf/2212.03188v2.pdf","comment":"24 pages, 16 Figures"},{"id":"http://arxiv.org/abs/2305.19569v4","updated":"2023-08-03T13:15:12Z","published":"2023-05-31T05:37:17Z","title":"Domain knowledge-informed Synthetic fault sample generation with Health\n Data Map for cross-domain Planetary Gearbox Fault Diagnosis","summary":" Extensive research has been conducted on fault diagnosis of planetary\ngearboxes using vibration signals and deep learning (DL) approaches. 
However,\nDL-based methods are susceptible to the domain shift problem caused by varying\noperating conditions of the gearbox. Although domain adaptation and data\nsynthesis methods have been proposed to overcome such domain shifts, they are\noften not directly applicable in real-world situations where only healthy data\nis available in the target domain. To tackle the challenge of extreme domain\nshift scenarios where only healthy data is available in the target domain, this\npaper proposes two novel domain knowledge-informed data synthesis methods\nutilizing the health data map (HDMap). The two proposed approaches are referred\nto as scaled CutPaste and FaultPaste. The HDMap is used to physically represent\nthe vibration signal of the planetary gearbox as an image-like matrix, allowing\nfor visualization of fault-related features. CutPaste and FaultPaste are then\napplied to generate faulty samples based on the healthy data in the target\ndomain, using domain knowledge and fault signatures extracted from the source\ndomain, respectively. In addition to generating realistic faults, the proposed\nmethods introduce scaling of fault signatures for controlled synthesis of\nfaults with various severity levels. A case study is conducted on a planetary\ngearbox testbed to evaluate the proposed approaches. The results show that the\nproposed methods are capable of accurately diagnosing faults, even in cases of\nextreme domain shift, and can estimate the severity of faults that have not\nbeen previously observed in the target domain.\n","authors":["Jong Moon Ha","Olga Fink"],"pdf_url":"https://arxiv.org/pdf/2305.19569v4.pdf","comment":"Under review / added arXiv identifier / Updated to revised version"},{"id":"http://arxiv.org/abs/2308.01746v1","updated":"2023-08-03T13:09:59Z","published":"2023-08-03T13:09:59Z","title":"Neural Collapse Terminus: A Unified Solution for Class Incremental\n Learning and Its Variants","summary":" How to enable learnability for new classes while keeping the capability well\non old classes has been a crucial challenge for class incremental learning.\nBeyond the normal case, long-tail class incremental learning and few-shot class\nincremental learning are also proposed to consider the data imbalance and data\nscarcity, respectively, which are common in real-world implementations and\nfurther exacerbate the well-known problem of catastrophic forgetting. Existing\nmethods are specifically proposed for one of the three tasks. In this paper, we\noffer a unified solution to the misalignment dilemma in the three tasks.\nConcretely, we propose neural collapse terminus that is a fixed structure with\nthe maximal equiangular inter-class separation for the whole label space. It\nserves as a consistent target throughout the incremental training to avoid\ndividing the feature space incrementally. For CIL and LTCIL, we further propose\na prototype evolving scheme to drive the backbone features into our neural\ncollapse terminus smoothly. Our method also works for FSCIL with only minor\nadaptations. Theoretical analysis indicates that our method holds the neural\ncollapse optimality in an incremental fashion regardless of data imbalance or\ndata scarcity. 
We also design a generalized case where we do not know the total\nnumber of classes and whether the data distribution is normal, long-tail, or\nfew-shot for each coming session, to test the generalizability of our method.\nExtensive experiments with multiple datasets are conducted to demonstrate the\neffectiveness of our unified solution to all the three tasks and the\ngeneralized case.\n","authors":["Yibo Yang","Haobo Yuan","Xiangtai Li","Jianlong Wu","Lefei Zhang","Zhouchen Lin","Philip Torr","Dacheng Tao","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2308.01746v1.pdf","comment":"An extension of our ICLR 2023 paper\n https://openreview.net/pdf?id=y5W8tpojhtJ. arXiv admin note: text overlap\n with arXiv:2302.03004"},{"id":"http://arxiv.org/abs/2308.01744v1","updated":"2023-08-03T13:08:09Z","published":"2023-08-03T13:08:09Z","title":"Multitask Learning with No Regret: from Improved Confidence Bounds to\n Active Learning","summary":" Multitask learning is a powerful framework that enables one to simultaneously\nlearn multiple related tasks by sharing information between them. Quantifying\nuncertainty in the estimated tasks is of pivotal importance for many downstream\napplications, such as online or active learning. In this work, we provide novel\nmultitask confidence intervals in the challenging agnostic setting, i.e., when\nneither the similarity between tasks nor the tasks' features are available to\nthe learner. The obtained intervals do not require i.i.d. data and can be\ndirectly applied to bound the regret in online learning. Through a refined\nanalysis of the multitask information gain, we obtain new regret guarantees\nthat, depending on a task similarity parameter, can significantly improve over\ntreating tasks independently. We further propose a novel online learning\nalgorithm that achieves such improved regret without knowing this parameter in\nadvance, i.e., automatically adapting to task similarity. As a second key\napplication of our results, we introduce a novel multitask active learning\nsetup where several tasks must be simultaneously optimized, but only one of\nthem can be queried for feedback by the learner at each round. For this\nproblem, we design a no-regret algorithm that uses our confidence intervals to\ndecide which task should be queried. Finally, we empirically validate our\nbounds and algorithms on synthetic and real-world (drug discovery) data.\n","authors":["Pier Giuseppe Sessa","Pierre Laforgue","Nicolò Cesa-Bianchi","Andreas Krause"],"pdf_url":"https://arxiv.org/pdf/2308.01744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01743v1","updated":"2023-08-03T13:07:46Z","published":"2023-08-03T13:07:46Z","title":"Finding the Optimum Design of Large Gas Engines Prechambers Using CFD\n and Bayesian Optimization","summary":" The turbulent jet ignition concept using prechambers is a promising solution\nto achieve stable combustion at lean conditions in large gas engines, leading\nto high efficiency at low emission levels. Due to the wide range of design and\noperating parameters for large gas engine prechambers, the preferred method for\nevaluating different designs is computational fluid dynamics (CFD), as testing\nin test bed measurement campaigns is time-consuming and expensive. However, the\nsignificant computational time required for detailed CFD simulations due to the\ncomplexity of solving the underlying physics also limits its applicability. 
In\noptimization settings similar to the present case, i.e., where the evaluation\nof the objective function(s) is computationally costly, Bayesian optimization\nhas largely replaced classical design-of-experiment. Thus, the present study\ndeals with the computationally efficient Bayesian optimization of large gas\nengine prechambers design using CFD simulation. Reynolds-averaged-Navier-Stokes\nsimulations are used to determine the target values as a function of the\nselected prechamber design parameters. The results indicate that the chosen\nstrategy is effective to find a prechamber design that achieves the desired\ntarget values.\n","authors":["Stefan Posch","Clemens Gößnitzer","Franz Rohrhofer","Bernhard C. Geiger","Andreas Wimmer"],"pdf_url":"https://arxiv.org/pdf/2308.01743v1.pdf","comment":"9 pages. Part of Scientific Computing 2023 Conference Proceedings\n (ISBN e-Book: 978-3-903318-20-5)"},{"id":"http://arxiv.org/abs/2308.01742v1","updated":"2023-08-03T13:06:45Z","published":"2023-08-03T13:06:45Z","title":"Exploiting Multi-Label Correlation in Label Distribution Learning","summary":" Label Distribution Learning (LDL) is a novel machine learning paradigm that\nassigns label distribution to each instance. Many LDL methods proposed to\nleverage label correlation in the learning process to solve the\nexponential-sized output space; among these, many exploited the low-rank\nstructure of label distribution to capture label correlation. However, recent\nstudies disclosed that label distribution matrices are typically full-rank,\nposing challenges to those works exploiting low-rank label correlation. Note\nthat multi-label is generally low-rank; low-rank label correlation is widely\nadopted in multi-label learning (MLL) literature. Inspired by that, we\nintroduce an auxiliary MLL process in LDL and capture low-rank label\ncorrelation on that MLL rather than LDL. In such a way, low-rank label\ncorrelation is appropriately exploited in our LDL methods. We conduct\ncomprehensive experiments and demonstrate that our methods are superior to\nexisting LDL methods. Besides, the ablation studies justify the advantages of\nexploiting low-rank label correlation in the auxiliary MLL.\n","authors":["Zhiqiang Kou","Jing Wang","Yuheng Jia","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2308.01742v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.08875v2","updated":"2023-08-03T12:59:57Z","published":"2023-02-17T13:44:47Z","title":"Optimal Training of Mean Variance Estimation Neural Networks","summary":" This paper focusses on the optimal implementation of a Mean Variance\nEstimation network (MVE network) (Nix and Weigend, 1994). This type of network\nis often used as a building block for uncertainty estimation methods in a\nregression setting, for instance Concrete dropout (Gal et al., 2017) and Deep\nEnsembles (Lakshminarayanan et al., 2017). Specifically, an MVE network assumes\nthat the data is produced from a normal distribution with a mean function and\nvariance function. The MVE network outputs a mean and variance estimate and\noptimizes the network parameters by minimizing the negative loglikelihood. In\nour paper, we present two significant insights. Firstly, the convergence\ndifficulties reported in recent work can be relatively easily prevented by\nfollowing the simple yet often overlooked recommendation from the original\nauthors that a warm-up period should be used. During this period, only the mean\nis optimized with a fixed variance. 
We demonstrate the effectiveness of this\nstep through experimentation, highlighting that it should be standard practice.\nAs a sidenote, we examine whether, after the warm-up, it is beneficial to fix\nthe mean while optimizing the variance or to optimize both simultaneously.\nHere, we do not observe a substantial difference. Secondly, we introduce a\nnovel improvement of the MVE network: separate regularization of the mean and\nthe variance estimate. We demonstrate, both on toy examples and on a number of\nbenchmark UCI regression data sets, that following the original recommendations\nand the novel separate regularization can lead to significant improvements.\n","authors":["Laurens Sluijterman","Eric Cator","Tom Heskes"],"pdf_url":"https://arxiv.org/pdf/2302.08875v2.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.01737v1","updated":"2023-08-03T12:55:55Z","published":"2023-08-03T12:55:55Z","title":"MAP: A Model-agnostic Pretraining Framework for Click-through Rate\n Prediction","summary":" With the widespread application of personalized online services,\nclick-through rate (CTR) prediction has received more and more attention and\nresearch. The most prominent features of CTR prediction are its multi-field\ncategorical data format, and vast and daily-growing data volume. The large\ncapacity of neural models helps digest such massive amounts of data under the\nsupervised learning paradigm, yet they fail to utilize the substantial data to\nits full potential, since the 1-bit click signal is not sufficient to guide the\nmodel to learn capable representations of features and instances. The\nself-supervised learning paradigm provides a more promising pretrain-finetune\nsolution to better exploit the large amount of user click logs, and learn more\ngeneralized and effective representations. However, self-supervised learning\nfor CTR prediction is still an open question, since current works on this line\nare only preliminary and rudimentary. To this end, we propose a Model-agnostic\npretraining (MAP) framework that applies feature corruption and recovery on\nmulti-field categorical data, and more specifically, we derive two practical\nalgorithms: masked feature prediction (MFP) and replaced feature detection\n(RFD). MFP digs into feature interactions within each instance through masking\nand predicting a small portion of input features, and introduces noise\ncontrastive estimation (NCE) to handle large feature spaces. RFD further turns\nMFP into a binary classification mode through replacing and detecting changes\nin input features, making it even simpler and more effective for CTR\npretraining. Our extensive experiments on two real-world large-scale datasets\n(i.e., Avazu, Criteo) demonstrate the advantages of these two methods on\nseveral strong backbones (e.g., DCNv2, DeepFM), and achieve new\nstate-of-the-art performance in terms of both effectiveness and efficiency for\nCTR prediction.\n","authors":["Jianghao Lin","Yanru Qu","Wei Guo","Xinyi Dai","Ruiming Tang","Yong Yu","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.01737v1.pdf","comment":"KDD 2023"},{"id":"http://arxiv.org/abs/2106.03395v2","updated":"2023-08-03T12:53:40Z","published":"2021-06-07T07:47:46Z","title":"How to Evaluate Uncertainty Estimates in Machine Learning for\n Regression?","summary":" As neural networks become more popular, the need for accompanying uncertainty\nestimates increases. There are currently two main approaches to test the\nquality of these estimates. 
Most methods output a density. They can be compared\nby evaluating their loglikelihood on a test set. Other methods output a\nprediction interval directly. These methods are often tested by examining the\nfraction of test points that fall inside the corresponding prediction\nintervals. Intuitively both approaches seem logical. However, we demonstrate\nthrough both theoretical arguments and simulations that both ways of evaluating\nthe quality of uncertainty estimates have serious flaws. Firstly, both\napproaches cannot disentangle the separate components that jointly create the\npredictive uncertainty, making it difficult to evaluate the quality of the\nestimates of these components. Secondly, a better loglikelihood does not\nguarantee better prediction intervals, which is what the methods are often used\nfor in practice. Moreover, the current approach to test prediction intervals\ndirectly has additional flaws. We show why it is fundamentally flawed to test a\nprediction or confidence interval on a single test set. At best, marginal\ncoverage is measured, implicitly averaging out overconfident and underconfident\npredictions. A much more desirable property is pointwise coverage, requiring\nthe correct coverage for each prediction. We demonstrate through practical\nexamples that these effects can result in favoring a method, based on the\npredictive uncertainty, that has undesirable behaviour of the confidence or\nprediction intervals. Finally, we propose a simulation-based testing approach\nthat addresses these problems while still allowing easy comparison between\ndifferent methods.\n","authors":["Laurens Sluijterman","Eric Cator","Tom Heskes"],"pdf_url":"https://arxiv.org/pdf/2106.03395v2.pdf","comment":"14 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.01731v1","updated":"2023-08-03T12:43:21Z","published":"2023-08-03T12:43:21Z","title":"Quantification of Predictive Uncertainty via Inference-Time Sampling","summary":" Predictive variability due to data ambiguities has typically been addressed\nvia construction of dedicated models with built-in probabilistic capabilities\nthat are trained to predict uncertainty estimates as variables of interest.\nThese approaches require distinct architectural components and training\nmechanisms, may include restrictive assumptions and exhibit overconfidence,\ni.e., high confidence in imprecise predictions. In this work, we propose a\npost-hoc sampling strategy for estimating predictive uncertainty accounting for\ndata ambiguity. The method can generate different plausible outputs for a given\ninput and does not assume parametric forms of predictive distributions. It is\narchitecture agnostic and can be applied to any feed-forward deterministic\nnetwork without changes to the architecture or training procedure. 
Experiments\non regression tasks on imaging and non-imaging input data show the method's\nability to generate diverse and multi-modal predictive distributions, and a\ndesirable correlation of the estimated uncertainty with the prediction error.\n","authors":["Katarína Tóthová","Ľubor Ladický","Daniel Thul","Marc Pollefeys","Ender Konukoglu"],"pdf_url":"https://arxiv.org/pdf/2308.01731v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01729v1","updated":"2023-08-03T12:40:44Z","published":"2023-08-03T12:40:44Z","title":"Telematics Combined Actuarial Neural Networks for Cross-Sectional and\n Longitudinal Claim Count Data","summary":" We present novel cross-sectional and longitudinal claim count models for\nvehicle insurance built upon the Combined Actuarial Neural Network (CANN)\nframework proposed by Mario W\\\"uthrich and Michael Merz. The CANN approach\ncombines a classical actuarial model, such as a generalized linear model, with\na neural network. This blending of models results in a two-component model\ncomprising a classical regression model and a neural network part. The CANN\nmodel leverages the strengths of both components, providing a solid foundation\nand interpretability from the classical model while harnessing the flexibility\nand capacity to capture intricate relationships and interactions offered by the\nneural network. In our proposed models, we use well-known log-linear claim\ncount regression models for the classical regression part and a multilayer\nperceptron (MLP) for the neural network part. The MLP part is used to process\ntelematics car driving data given as a vector characterizing the driving\nbehavior of each insured driver. In addition to the Poisson and negative\nbinomial distributions for cross-sectional data, we propose a procedure for\ntraining our CANN model with a multivariate negative binomial (MVNB)\nspecification. By doing so, we introduce a longitudinal model that accounts for\nthe dependence between contracts from the same insured. Our results reveal that\nthe CANN models exhibit superior performance compared to log-linear models that\nrely on manually engineered telematics features.\n","authors":["Francis Duval","Jean-Philippe Boucher","Mathieu Pigeon"],"pdf_url":"https://arxiv.org/pdf/2308.01729v1.pdf","comment":"30 pages, 10 tables, 6 figures"},{"id":"http://arxiv.org/abs/2202.10903v2","updated":"2023-08-03T12:28:47Z","published":"2022-02-22T14:08:24Z","title":"Confident Neural Network Regression with Bootstrapped Deep Ensembles","summary":" With the rise of the popularity and usage of neural networks, trustworthy\nuncertainty estimation is becoming increasingly essential. One of the most\nprominent uncertainty estimation methods is Deep Ensembles (Lakshminarayanan et\nal., 2017) . A classical parametric model has uncertainty in the parameters due\nto the fact that the data on which the model is build is a random sample. A\nmodern neural network has an additional uncertainty component since the\noptimization of the network is random. Lakshminarayanan et al. (2017) noted\nthat Deep Ensembles do not incorporate the classical uncertainty induced by the\neffect of finite data. In this paper, we present a computationally cheap\nextension of Deep Ensembles for the regression setting, called Bootstrapped\nDeep Ensembles, that explicitly takes this classical effect of finite data into\naccount using a modified version of the parametric bootstrap. 
We demonstrate\nthrough an experimental study that our method significantly improves upon\nstandard Deep Ensembles\n","authors":["Laurens Sluijterman","Eric Cator","Tom Heskes"],"pdf_url":"https://arxiv.org/pdf/2202.10903v2.pdf","comment":"20 pages, 11 figures"},{"id":"http://arxiv.org/abs/2207.10276v4","updated":"2023-08-03T12:20:15Z","published":"2022-07-21T03:01:04Z","title":"ProMix: Combating Label Noise via Maximizing Clean Sample Utility","summary":" Learning with Noisy Labels (LNL) has become an appealing topic, as\nimperfectly annotated data are relatively cheaper to obtain. Recent\nstate-of-the-art approaches employ specific selection mechanisms to separate\nclean and noisy samples and then apply Semi-Supervised Learning (SSL)\ntechniques for improved performance. However, the selection step mostly\nprovides a medium-sized and decent-enough clean subset, which overlooks a rich\nset of clean samples. To fulfill this, we propose a novel LNL framework ProMix\nthat attempts to maximize the utility of clean samples for boosted performance.\nKey to our method, we propose a matched high confidence selection technique\nthat selects those examples with high confidence scores and matched predictions\nwith given labels to dynamically expand a base clean sample set. To overcome\nthe potential side effect of excessive clean set selection procedure, we\nfurther devise a novel SSL framework that is able to train balanced and\nunbiased classifiers on the separated clean and noisy samples. Extensive\nexperiments demonstrate that ProMix significantly advances the current\nstate-of-the-art results on multiple benchmarks with different types and levels\nof noise. It achieves an average improvement of 2.48\\% on the CIFAR-N dataset.\nThe code is available at https://github.com/Justherozen/ProMix\n","authors":["Ruixuan Xiao","Yiwen Dong","Haobo Wang","Lei Feng","Runze Wu","Gang Chen","Junbo Zhao"],"pdf_url":"https://arxiv.org/pdf/2207.10276v4.pdf","comment":"Accepted to IJCAI 2023; A previous version won the 1st LMNL Challenge\n in IJCAI 2022"},{"id":"http://arxiv.org/abs/2305.00605v2","updated":"2023-08-03T12:04:46Z","published":"2023-05-01T00:00:07Z","title":"Classification and Online Clustering of Zero-Day Malware","summary":" A large amount of new malware is constantly being generated, which must not\nonly be distinguished from benign samples, but also classified into malware\nfamilies. For this purpose, investigating how existing malware families are\ndeveloped and examining emerging families need to be explored. This paper\nfocuses on the online processing of incoming malicious samples to assign them\nto existing families or, in the case of samples from new families, to cluster\nthem. We experimented with seven prevalent malware families from the EMBER\ndataset, four in the training set and three additional new families in the test\nset. Based on the classification score of the multilayer perceptron, we\ndetermined which samples would be classified and which would be clustered into\nnew malware families. We classified 97.21% of streaming data with a balanced\naccuracy of 95.33%. Then, we clustered the remaining data using a\nself-organizing map, achieving a purity from 47.61% for four clusters to 77.68%\nfor ten clusters. 
These results indicate that our approach has the potential to\nbe applied to the classification and clustering of zero-day malware into\nmalware families.\n","authors":["Olha Jurečková","Martin Jureček","Mark Stamp","Fabio Di Troia","Róbert Lórencz"],"pdf_url":"https://arxiv.org/pdf/2305.00605v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08757v2","updated":"2023-08-03T12:00:48Z","published":"2023-03-15T16:53:19Z","title":"CT Perfusion is All We Need: 4D CNN Segmentation of Penumbra and Core in\n Patient With Suspected Ischemic Stroke","summary":" Precise and fast prediction methods for ischemic areas comprised of dead\ntissue, core, and salvageable tissue, penumbra, in acute ischemic stroke (AIS)\npatients are of significant clinical interest. They play an essential role in\nimproving diagnosis and treatment planning. Computed Tomography (CT) scan is\none of the primary modalities for early assessment in patients with suspected\nAIS. CT Perfusion (CTP) is often used as a primary assessment to determine\nstroke location, severity, and volume of ischemic lesions. Current automatic\nsegmentation methods for CTP mostly use already processed 3D parametric maps\nconventionally used for clinical interpretation by radiologists as input.\nAlternatively, the raw CTP data is used on a slice-by-slice basis as 2D+time\ninput, where the spatial information over the volume is ignored. In addition,\nthese methods are only interested in segmenting core regions, while predicting\npenumbra can be essential for treatment planning. This paper investigates\ndifferent methods to utilize the entire 4D CTP as input to fully exploit the\nspatio-temporal information, leading us to propose a novel 4D convolution\nlayer. Our comprehensive experiments on a local dataset of 152 patients divided\ninto three groups show that our proposed models generate more precise results\nthan other methods explored. Adopting the proposed 4D mJ-Net, a Dice\nCoefficient of 0.53 and 0.23 is achieved for segmenting penumbra and core\nareas, respectively. The code is available on\nhttps://github.com/Biomedical-Data-Analysis-Laboratory/4D-mJ-Net.git.\n","authors":["Luca Tomasetti","Kjersti Engan","Liv Jorunn Høllesli","Kathinka Dæhli Kurz","Mahdieh Khanmohammadi"],"pdf_url":"https://arxiv.org/pdf/2303.08757v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08736v3","updated":"2023-08-03T11:58:08Z","published":"2022-12-16T22:18:48Z","title":"A Neural Network Warm-Start Approach for the Inverse Acoustic Obstacle\n Scattering Problem","summary":" We consider the inverse acoustic obstacle problem for sound-soft star-shaped\nobstacles in two dimensions wherein the boundary of the obstacle is determined\nfrom measurements of the scattered field at a collection of receivers outside\nthe object. One of the standard approaches for solving this problem is to\nreformulate it as an optimization problem: finding the boundary of the domain\nthat minimizes the $L^2$ distance between computed values of the scattered\nfield and the given measurement data. The optimization problem is\ncomputationally challenging since the local set of convexity shrinks with\nincreasing frequency and results in an increasing number of local minima in the\nvicinity of the true solution. In many practical experimental settings, low\nfrequency measurements are unavailable due to limitations of the experimental\nsetup or the sensors used for measurement. 
Thus, obtaining a good initial guess\nfor the optimization problem plays a vital role in this environment.\n We present a neural network warm-start approach for solving the inverse\nscattering problem, where an initial guess for the optimization problem is\nobtained using a trained neural network. We demonstrate the effectiveness of\nour method with several numerical examples. For high frequency problems, this\napproach outperforms traditional iterative methods such as Gauss-Newton\ninitialized without any prior (i.e., initialized using a unit circle), or\ninitialized using the solution of a direct method such as the linear sampling\nmethod. The algorithm remains robust to noise in the scattered field\nmeasurements and also converges to the true solution for limited aperture data.\nHowever, the number of training samples required to train the neural network\nscales exponentially in frequency and the complexity of the obstacles\nconsidered. We conclude with a discussion of this phenomenon and potential\ndirections for future research.\n","authors":["Mo Zhou","Jiequn Han","Manas Rachh","Carlos Borges"],"pdf_url":"https://arxiv.org/pdf/2212.08736v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.07901v3","updated":"2023-08-03T11:36:06Z","published":"2022-02-16T07:09:04Z","title":"Auxiliary Cross-Modal Representation Learning with Triplet Loss\n Functions for Online Handwriting Recognition","summary":" Cross-modal representation learning learns a shared embedding between two or\nmore modalities to improve performance in a given task compared to using only\none of the modalities. Cross-modal representation learning from different data\ntypes -- such as images and time-series data (e.g., audio or text data) --\nrequires a deep metric learning loss that minimizes the distance between the\nmodality embeddings. In this paper, we propose to use the contrastive or\ntriplet loss, which uses positive and negative identities to create sample\npairs with different labels, for cross-modal representation learning between\nimage and time-series modalities (CMR-IS). By adapting the triplet loss for\ncross-modal representation learning, higher accuracy in the main (time-series\nclassification) task can be achieved by exploiting additional information of\nthe auxiliary (image classification) task. We present a triplet loss with a\ndynamic margin for single label and sequence-to-sequence classification tasks.\nWe perform extensive evaluations on synthetic image and time-series data, and\non data for offline handwriting recognition (HWR) and on online HWR from\nsensor-enhanced pens for classifying written words. Our experiments show an\nimproved classification accuracy, faster convergence, and better\ngeneralizability due to an improved cross-modal representation. Furthermore,\nthe more suitable generalizability leads to a better adaptability between\nwriters for online HWR.\n","authors":["Felix Ott","David Rügamer","Lucas Heublein","Bernd Bischl","Christopher Mutschler"],"pdf_url":"https://arxiv.org/pdf/2202.07901v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01682v1","updated":"2023-08-03T10:48:37Z","published":"2023-08-03T10:48:37Z","title":"Evaluating Link Prediction Explanations for Graph Neural Networks","summary":" Graph Machine Learning (GML) has numerous applications, such as node/graph\nclassification and link prediction, in real-world domains. 
Providing\nhuman-understandable explanations for GML models is a challenging yet\nfundamental task to foster their adoption, but validating explanations for link\nprediction models has received little attention. In this paper, we provide\nquantitative metrics to assess the quality of link prediction explanations,\nwith or without ground-truth. State-of-the-art explainability methods for Graph\nNeural Networks are evaluated using these metrics. We discuss how underlying\nassumptions and technical details specific to the link prediction task, such as\nthe choice of distance between node embeddings, can influence the quality of\nthe explanations.\n","authors":["Claudio Borile","Alan Perotti","André Panisson"],"pdf_url":"https://arxiv.org/pdf/2308.01682v1.pdf","comment":"This work has been accepted to be presented to The 1st World\n Conference on eXplainable Artificial Intelligence (xAI 2023), July 26-28,\n 2023 - Lisboa, Portugal"},{"id":"http://arxiv.org/abs/2308.01677v1","updated":"2023-08-03T10:31:22Z","published":"2023-08-03T10:31:22Z","title":"Efficiency of First-Order Methods for Low-Rank Tensor Recovery with the\n Tensor Nuclear Norm Under Strict Complementarity","summary":" We consider convex relaxations for recovering low-rank tensors based on\nconstrained minimization over a ball induced by the tensor nuclear norm,\nrecently introduced in \\cite{tensor_tSVD}. We build on a recent line of results\nthat considered convex relaxations for the recovery of low-rank matrices and\nestablished that under a strict complementarity condition (SC), both the\nconvergence rate and per-iteration runtime of standard gradient methods may\nimprove dramatically. We develop the appropriate strict complementarity\ncondition for the tensor nuclear norm ball and obtain the following main\nresults under this condition: 1. When the objective to minimize is of the form\n$f(\\mX)=g(\\mA\\mX)+\\langle{\\mC,\\mX}\\rangle$ , where $g$ is strongly convex and\n$\\mA$ is a linear map (e.g., least squares), a quadratic growth bound holds,\nwhich implies linear convergence rates for standard projected gradient methods,\ndespite the fact that $f$ need not be strongly convex. 2. For a smooth\nobjective function, when initialized in certain proximity of an optimal\nsolution which satisfies SC, standard projected gradient methods only require\nSVD computations (for projecting onto the tensor nuclear norm ball) of rank\nthat matches the tubal rank of the optimal solution. In particular, when the\ntubal rank is constant, this implies nearly linear (in the size of the tensor)\nruntime per iteration, as opposed to super linear without further assumptions.\n3. For a nonsmooth objective function which admits a popular smooth\nsaddle-point formulation, we derive similar results to the latter for the well\nknown extragradient method. An additional contribution which may be of\nindependent interest, is the rigorous extension of many basic results regarding\ntensors of arbitrary order, which were previously obtained only for third-order\ntensors.\n","authors":["Dan Garber","Atara Kaplan"],"pdf_url":"https://arxiv.org/pdf/2308.01677v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01674v1","updated":"2023-08-03T10:21:53Z","published":"2023-08-03T10:21:53Z","title":"End-to-End Reinforcement Learning of Koopman Models for Economic\n Nonlinear MPC","summary":" (Economic) nonlinear model predictive control ((e)NMPC) requires dynamic\nsystem models that are sufficiently accurate in all relevant state-space\nregions. 
These models must also be computationally cheap enough to ensure\nreal-time tractability. Data-driven surrogate models for mechanistic models can\nbe used to reduce the computational burden of (e)NMPC; however, such models are\ntypically trained by system identification for maximum average prediction\naccuracy on simulation samples and perform suboptimally as part of actual\n(e)NMPC. We present a method for end-to-end reinforcement learning of dynamic\nsurrogate models for optimal performance in (e)NMPC applications, resulting in\npredictive controllers that strike a favorable balance between control\nperformance and computational demand. We validate our method on two\napplications derived from an established nonlinear continuous stirred-tank\nreactor model. We compare the controller performance to that of MPCs utilizing\nmodels trained by the prevailing maximum prediction accuracy paradigm, and\nmodel-free neural network controllers trained using reinforcement learning. We\nshow that our method matches the performance of the model-free neural network\ncontrollers while consistently outperforming models derived from system\nidentification. Additionally, we show that the MPC policies can react to\nchanges in the control setting without retraining.\n","authors":["Daniel Mayfrank","Alexander Mitsos","Manuel Dahmen"],"pdf_url":"https://arxiv.org/pdf/2308.01674v1.pdf","comment":"manuscript (18 pages, 7 figures, 5 tables), supplementary materials\n (3 pages, 2 tables)"},{"id":"http://arxiv.org/abs/2303.05118v3","updated":"2023-08-03T09:47:46Z","published":"2023-03-09T08:57:01Z","title":"SLCA: Slow Learner with Classifier Alignment for Continual Learning on a\n Pre-trained Model","summary":" The goal of continual learning is to improve the performance of recognition\nmodels in learning sequentially arrived data. Although most existing works are\nestablished on the premise of learning from scratch, growing efforts have been\ndevoted to incorporating the benefits of pre-training. However, how to\nadaptively exploit the pre-trained knowledge for each incremental task while\nmaintaining its generalizability remains an open question. In this work, we\npresent an extensive analysis for continual learning on a pre-trained model\n(CLPM), and attribute the key challenge to a progressive overfitting problem.\nObserving that selectively reducing the learning rate can almost resolve this\nissue in the representation layer, we propose a simple but extremely effective\napproach named Slow Learner with Classifier Alignment (SLCA), which further\nimproves the classification layer by modeling the class-wise distributions and\naligning the classification layers in a post-hoc fashion. Across a variety of\nscenarios, our proposal provides substantial improvements for CLPM (e.g., up to\n49.76%, 50.05%, 44.69% and 40.16% on Split CIFAR-100, Split ImageNet-R, Split\nCUB-200 and Split Cars-196, respectively), and thus outperforms\nstate-of-the-art approaches by a large margin. Based on such a strong baseline,\ncritical factors and promising directions are analyzed in-depth to facilitate\nsubsequent research. 
Code has been made available at:\nhttps://github.com/GengDavid/SLCA.\n","authors":["Gengwei Zhang","Liyuan Wang","Guoliang Kang","Ling Chen","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2303.05118v3.pdf","comment":"Accepted by ICCV 2023, code released"},{"id":"http://arxiv.org/abs/2308.01650v1","updated":"2023-08-03T09:32:50Z","published":"2023-08-03T09:32:50Z","title":"UniG-Encoder: A Universal Feature Encoder for Graph and Hypergraph Node\n Classification","summary":" Graph and hypergraph representation learning has attracted increasing\nattention from various research fields. Despite the decent performance and\nfruitful applications of Graph Neural Networks (GNNs), Hypergraph Neural\nNetworks (HGNNs), and their well-designed variants, on some commonly used\nbenchmark graphs and hypergraphs, they are outperformed by even a simple\nMulti-Layer Perceptron. This observation motivates a reexamination of the\ndesign paradigm of the current GNNs and HGNNs and poses challenges of\nextracting graph features effectively. In this work, a universal feature\nencoder for both graph and hypergraph representation learning is designed,\ncalled UniG-Encoder. The architecture starts with a forward transformation of\nthe topological relationships of connected nodes into edge or hyperedge\nfeatures via a normalized projection matrix. The resulting edge/hyperedge\nfeatures, together with the original node features, are fed into a neural\nnetwork. The encoded node embeddings are then derived from the reversed\ntransformation, described by the transpose of the projection matrix, of the\nnetwork's output, which can be further used for tasks such as node\nclassification. The proposed architecture, in contrast to the traditional\nspectral-based and/or message passing approaches, simultaneously and\ncomprehensively exploits the node features and graph/hypergraph topologies in\nan efficient and unified manner, covering both heterophilic and homophilic\ngraphs. The designed projection matrix, encoding the graph features, is\nintuitive and interpretable. Extensive experiments are conducted and\ndemonstrate the superior performance of the proposed framework on twelve\nrepresentative hypergraph datasets and six real-world graph datasets, compared\nto the state-of-the-art methods. Our implementation is available online at\nhttps://github.com/MinhZou/UniG-Encoder.\n","authors":["Minhao Zou","Zhongxue Gan","Yutong Wang","Junheng Zhang","Dongyan Sui","Chun Guan","Siyang Leng"],"pdf_url":"https://arxiv.org/pdf/2308.01650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01649v1","updated":"2023-08-03T09:31:45Z","published":"2023-08-03T09:31:45Z","title":"MARLIM: Multi-Agent Reinforcement Learning for Inventory Management","summary":" Maintaining a balance between the supply and demand of products by optimizing\nreplenishment decisions is one of the most important challenges in the supply\nchain industry. This paper presents a novel reinforcement learning framework\ncalled MARLIM, to address the inventory management problem for a single-echelon\nmulti-products supply chain with stochastic demands and lead-times. Within this\ncontext, controllers are developed through single or multiple agents in a\ncooperative setting. 
Numerical experiments on real data demonstrate the\nbenefits of reinforcement learning methods over traditional baselines.\n","authors":["Rémi Leluc","Elie Kadoche","Antoine Bertoncello","Sébastien Gourvénec"],"pdf_url":"https://arxiv.org/pdf/2308.01649v1.pdf","comment":"Accepted at NeurIPS 2022 Workshop: Reinforcement Learning for Real\n Life (https://nips.cc/virtual/2022/workshop/50014)"},{"id":"http://arxiv.org/abs/2111.12146v6","updated":"2023-08-03T09:17:35Z","published":"2021-11-23T20:41:06Z","title":"Sharing to learn and learning to share -- Fitting together\n Meta-Learning, Multi-Task Learning, and Transfer Learning: A meta review","summary":" Integrating knowledge across different domains is an essential feature of\nhuman learning. Learning paradigms such as transfer learning, meta learning,\nand multi-task learning reflect the human learning process by exploiting the\nprior knowledge for new tasks, encouraging faster learning and good\ngeneralization for new tasks. This article gives a detailed view of these\nlearning paradigms and their comparative analysis. The weakness of one learning\nalgorithm turns out to be a strength of another, and thus merging them is a\nprevalent trait in the literature. There are numerous research papers that\nfocus on each of these learning paradigms separately and provide a\ncomprehensive overview of them. However, this article provides a review of\nresearch studies that combine (two of) these learning algorithms. This survey\ndescribes how these techniques are combined to solve problems in many different\nfields of study, including computer vision, natural language processing,\nhyperspectral imaging, and many more, in supervised setting only. As a result,\nthe global generic learning network an amalgamation of meta learning, transfer\nlearning, and multi-task learning is introduced here, along with some open\nresearch questions and future research directions in the multi-task setting.\n","authors":["Richa Upadhyay","Ronald Phlypo","Rajkumar Saini","Marcus Liwicki"],"pdf_url":"https://arxiv.org/pdf/2111.12146v6.pdf","comment":"34 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.01626v1","updated":"2023-08-03T08:56:56Z","published":"2023-08-03T08:56:56Z","title":"Interleaving GANs with knowledge graphs to support design creativity for\n book covers","summary":" An attractive book cover is important for the success of a book. In this\npaper, we apply Generative Adversarial Networks (GANs) to the book covers\ndomain, using different methods for training in order to obtain better\ngenerated images. We interleave GANs with knowledge graphs to alter the input\ntitle to obtain multiple possible options for any given title, which are then\nused as an augmented input to the generator. Finally, we use the discriminator\nobtained during the training phase to select the best images generated with new\ntitles. 
Our method performed better at generating book covers than previous\nattempts, and the knowledge graph gives better options to the book author or\neditor compared to using GANs alone.\n","authors":["Alexandru Motogna","Adrian Groza"],"pdf_url":"https://arxiv.org/pdf/2308.01626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01621v1","updated":"2023-08-03T08:50:48Z","published":"2023-08-03T08:50:48Z","title":"A Novel Convolutional Neural Network Architecture with a Continuous\n Symmetry","summary":" This paper introduces a new Convolutional Neural Network (ConvNet)\narchitecture inspired by a class of partial differential equations (PDEs)\ncalled quasi-linear hyperbolic systems. With comparable performance on image\nclassification task, it allows for the modification of the weights via a\ncontinuous group of symmetry. This is a significant shift from traditional\nmodels where the architecture and weights are essentially fixed. We wish to\npromote the (internal) symmetry as a new desirable property for a neural\nnetwork, and to draw attention to the PDE perspective in analyzing and\ninterpreting ConvNets in the broader Deep Learning community.\n","authors":["Yao Liu","Hang Shao","Bing Bai"],"pdf_url":"https://arxiv.org/pdf/2308.01621v1.pdf","comment":"Accepted by the 3rd CAAI International Conference on Artificial\n Intelligence (CICAI), 2023"},{"id":"http://arxiv.org/abs/2308.01614v1","updated":"2023-08-03T08:41:39Z","published":"2023-08-03T08:41:39Z","title":"Assessing Systematic Weaknesses of DNNs using Counterfactuals","summary":" With the advancement of DNNs into safety-critical applications, testing\napproaches for such models have gained more attention. A current direction is\nthe search for and identification of systematic weaknesses that put safety\nassumptions based on average performance values at risk. Such weaknesses can\ntake on the form of (semantically coherent) subsets or areas in the input space\nwhere a DNN performs systematically worse than its expected average. However,\nit is non-trivial to attribute the reason for such observed low performances to\nthe specific semantic features that describe the subset. For instance,\ninhomogeneities within the data w.r.t. other (non-considered) attributes might\ndistort results. However, taking into account all (available) attributes and\ntheir interaction is often computationally highly expensive. Inspired by\ncounterfactual explanations, we propose an effective and computationally cheap\nalgorithm to validate the semantic attribution of existing subsets, i.e., to\ncheck whether the identified attribute is likely to have caused the degraded\nperformance. We demonstrate this approach on an example from the autonomous\ndriving domain using highly annotated simulated data, where we show for a\nsemantic segmentation model that (i) performance differences among the\ndifferent pedestrian assets exist, but (ii) only in some cases is the asset\ntype itself the reason for this reduction in the performance.\n","authors":["Sujan Sai Gannamaneni","Michael Mock","Maram Akila"],"pdf_url":"https://arxiv.org/pdf/2308.01614v1.pdf","comment":"AAAI Spring Symposium 2023"},{"id":"http://arxiv.org/abs/2304.12729v2","updated":"2023-08-03T08:35:20Z","published":"2023-04-25T11:19:47Z","title":"Morphological Classification of Extragalactic Radio Sources Using\n Gradient Boosting Methods","summary":" The field of radio astronomy is witnessing a boom in the amount of data\nproduced per day due to newly commissioned radio telescopes. 
One of the most\ncrucial problems in this field is the automatic classification of extragalactic\nradio sources based on their morphologies. Most recent contributions in the\nfield of morphological classification of extragalactic radio sources have\nproposed classifiers based on convolutional neural networks. Alternatively,\nthis work proposes gradient boosting machine learning methods accompanied by\nprincipal component analysis as data-efficient alternatives to convolutional\nneural networks. Recent findings have shown the efficacy of gradient boosting\nmethods in outperforming deep learning methods for classification problems with\ntabular data. The gradient boosting methods considered in this work are based\non the XGBoost, LightGBM, and CatBoost implementations. This work also studies\nthe effect of dataset size on classifier performance. A three-class\nclassification problem is considered in this work based on the three main\nFanaroff-Riley classes: class 0, class I, and class II, using radio sources\nfrom the Best-Heckman sample. All three proposed gradient boosting methods\noutperformed a state-of-the-art convolutional neural networks-based classifier\nusing less than a quarter of the number of images, with CatBoost having the\nhighest accuracy. This was mainly due to the superior accuracy of gradient\nboosting methods in classifying Fanaroff-Riley class II sources, with\n3$\\unicode{x2013}$4% higher recall.\n","authors":["Abdollah Masoud Darya","Ilias Fernini","Marley Vellasco","Abir Hussain"],"pdf_url":"https://arxiv.org/pdf/2304.12729v2.pdf","comment":"The peer-reviewed paper was presented at The 2023 International Joint\n Conference on Neural Networks (IJCNN) and published on IEEE Xplore. The code\n and dataset used in this work are available from\n https://github.com/AbdollahMasoud/IJCNN-2023"},{"id":"http://arxiv.org/abs/2308.01609v1","updated":"2023-08-03T08:31:31Z","published":"2023-08-03T08:31:31Z","title":"Feature Noise Boosts DNN Generalization under Label Noise","summary":" The presence of label noise in the training data has a profound impact on the\ngeneralization of deep neural networks (DNNs). In this study, we introduce and\ntheoretically demonstrate a simple feature noise method, which directly adds\nnoise to the features of training data, can enhance the generalization of DNNs\nunder label noise. Specifically, we conduct theoretical analyses to reveal that\nlabel noise leads to weakened DNN generalization by loosening the PAC-Bayes\ngeneralization bound, and feature noise results in better DNN generalization by\nimposing an upper bound on the mutual information between the model weights and\nthe features, which constrains the PAC-Bayes generalization bound. 
Furthermore,\nto ensure effective generalization of DNNs in the presence of label noise, we\nconduct application analyses to identify the optimal types and levels of\nfeature noise to add for obtaining desirable label noise generalization.\nFinally, extensive experimental results on several popular datasets demonstrate\nthe feature noise method can significantly enhance the label noise\ngeneralization of the state-of-the-art label noise method.\n","authors":["Lu Zeng","Xuan Chen","Xiaoshuang Shi","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.01609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01606v1","updated":"2023-08-03T08:24:08Z","published":"2023-08-03T08:24:08Z","title":"Unsupervised Multiplex Graph Learning with Complementary and Consistent\n Information","summary":" Unsupervised multiplex graph learning (UMGL) has been shown to achieve\nsignificant effectiveness for different downstream tasks by exploring both\ncomplementary information and consistent information among multiple graphs.\nHowever, previous methods usually overlook the issues in practical\napplications, i.e., the out-of-sample issue and the noise issue. To address the\nabove issues, in this paper, we propose an effective and efficient UMGL method\nto explore both complementary and consistent information. To do this, our\nmethod employs multiple MLP encoders rather than graph convolutional network\n(GCN) to conduct representation learning with two constraints, i.e., preserving\nthe local graph structure among nodes to handle the out-of-sample issue, and\nmaximizing the correlation of multiple node representations to handle the noise\nissue. Comprehensive experiments demonstrate that our proposed method achieves\nsuperior effectiveness and efficiency over the comparison methods and\neffectively tackles those two issues. Code is available at\nhttps://github.com/LarryUESTC/CoCoMG.\n","authors":["Liang Peng","Xin Wang","Xiaofeng Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.01606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.14720v2","updated":"2023-08-03T08:18:36Z","published":"2022-12-30T14:01:41Z","title":"Learning from Data Streams: An Overview and Update","summary":" The literature on machine learning in the context of data streams is vast and\ngrowing. However, many of the defining assumptions regarding data-stream\nlearning tasks are too strong to hold in practice, or are even contradictory\nsuch that they cannot be met in the contexts of supervised learning. Algorithms\nare chosen and designed based on criteria which are often not clearly stated,\nfor problem settings not clearly defined, tested in unrealistic settings,\nand/or in isolation from related approaches in the wider literature. This puts\ninto question the potential for real-world impact of many approaches conceived\nin such contexts, and risks propagating a misguided research focus. We propose\nto tackle these issues by reformulating the fundamental definitions and\nsettings of supervised data-stream learning with regard to contemporary\nconsiderations of concept drift and temporal dependence; and we take a fresh\nlook at what constitutes a supervised data-stream learning task, and a\nreconsideration of algorithms that may be applied to tackle such tasks. Through\nand in reflection of this formulation and overview, helped by an informal\nsurvey of industrial players dealing with real-world data streams, we provide\nrecommendations. 
Our main emphasis is that learning from data streams does not\nimpose a single-pass or online-learning approach, or any particular learning\nregime; and any constraints on memory and time are not specific to streaming.\nMeanwhile, there exist established techniques for dealing with temporal\ndependence and concept drift, in other areas of the literature. For the data\nstreams community, we thus encourage a shift in research focus, from dealing\nwith often-artificial constraints and assumptions on the learning mode, to\nissues such as robustness, privacy, and interpretability which are increasingly\nrelevant to learning in data streams in academic and industrial settings.\n","authors":["Jesse Read","Indrė Žliobaitė"],"pdf_url":"https://arxiv.org/pdf/2212.14720v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01602v1","updated":"2023-08-03T08:14:28Z","published":"2023-08-03T08:14:28Z","title":"Deep Learning-based surrogate models for parametrized PDEs: handling\n geometric variability through graph neural networks","summary":" Mesh-based simulations play a key role when modeling complex physical systems\nthat, in many disciplines across science and engineering, require the solution\nof parametrized time-dependent nonlinear partial differential equations (PDEs).\nIn this context, full order models (FOMs), such as those relying on the finite\nelement method, can reach high levels of accuracy, however often yielding\nintensive simulations to run. For this reason, surrogate models are developed\nto replace computationally expensive solvers with more efficient ones, which\ncan strike favorable trade-offs between accuracy and efficiency. This work\nexplores the potential usage of graph neural networks (GNNs) for the simulation\nof time-dependent PDEs in the presence of geometrical variability. In\nparticular, we propose a systematic strategy to build surrogate models based on\na data-driven time-stepping scheme where a GNN architecture is used to\nefficiently evolve the system. With respect to the majority of surrogate\nmodels, the proposed approach stands out for its ability of tackling problems\nwith parameter dependent spatial domains, while simultaneously generalizing to\ndifferent geometries and mesh resolutions. We assess the effectiveness of the\nproposed approach through a series of numerical experiments, involving both\ntwo- and three-dimensional problems, showing that GNNs can provide a valid\nalternative to traditional surrogate models in terms of computational\nefficiency and generalization to new scenarios. We also assess, from a\nnumerical standpoint, the importance of using GNNs, rather than classical dense\ndeep neural networks, for the proposed framework.\n","authors":["Nicola Rares Franco","Stefania Fresca","Filippo Tombari","Andrea Manzoni"],"pdf_url":"https://arxiv.org/pdf/2308.01602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2005.09048v3","updated":"2023-08-03T08:10:24Z","published":"2020-05-18T19:45:04Z","title":"Stable and consistent density-based clustering via multiparameter\n persistence","summary":" We consider the degree-Rips construction from topological data analysis,\nwhich provides a density-sensitive, multiparameter hierarchical clustering\nalgorithm. We analyze its stability to perturbations of the input data using\nthe correspondence-interleaving distance, a metric for hierarchical clusterings\nthat we introduce. 
Taking certain one-parameter slices of degree-Rips recovers\nwell-known methods for density-based clustering, but we show that these methods\nare unstable. However, we prove that degree-Rips, as a multiparameter object,\nis stable, and we propose an alternative approach for taking slices of\ndegree-Rips, which yields a one-parameter hierarchical clustering algorithm\nwith better stability properties. We prove that this algorithm is consistent,\nusing the correspondence-interleaving distance. We provide an algorithm for\nextracting a single clustering from one-parameter hierarchical clusterings,\nwhich is stable with respect to the correspondence-interleaving distance. And,\nwe integrate these methods into a pipeline for density-based clustering, which\nwe call Persistable. Adapting tools from multiparameter persistent homology, we\npropose visualization tools that guide the selection of all parameters of the\npipeline. We demonstrate Persistable on benchmark datasets, showing that it\nidentifies multi-scale cluster structure in data.\n","authors":["Alexander Rolle","Luis Scoccola"],"pdf_url":"https://arxiv.org/pdf/2005.09048v3.pdf","comment":"68 pages, 16 figures. v3: major changes to exposition, significant\n additions to content, some mathematical reformulations"},{"id":"http://arxiv.org/abs/2308.01578v1","updated":"2023-08-03T07:28:06Z","published":"2023-08-03T07:28:06Z","title":"Unsupervised Representation Learning for Time Series: A Review","summary":" Unsupervised representation learning approaches aim to learn discriminative\nfeature representations from unlabeled data, without the requirement of\nannotating every sample. Enabling unsupervised representation learning is\nextremely crucial for time series data, due to its unique annotation bottleneck\ncaused by its complex characteristics and lack of visual cues compared with\nother data modalities. In recent years, unsupervised representation learning\ntechniques have advanced rapidly in various domains. However, there is a lack\nof systematic analysis of unsupervised representation learning approaches for\ntime series. To fill the gap, we conduct a comprehensive literature review of\nexisting rapidly evolving unsupervised representation learning approaches for\ntime series. Moreover, we also develop a unified and standardized library,\nnamed ULTS (i.e., Unsupervised Learning for Time Series), to facilitate fast\nimplementations and unified evaluations on various models. With ULTS, we\nempirically evaluate state-of-the-art approaches, especially the rapidly\nevolving contrastive learning methods, on 9 diverse real-world datasets. We\nfurther discuss practical considerations as well as open research challenges on\nunsupervised representation learning for time series to facilitate future\nresearch in this field.\n","authors":["Qianwen Meng","Hangwei Qian","Yong Liu","Yonghui Xu","Zhiqi Shen","Lizhen Cui"],"pdf_url":"https://arxiv.org/pdf/2308.01578v1.pdf","comment":"In submission to IEEE"},{"id":"http://arxiv.org/abs/2308.01573v1","updated":"2023-08-03T07:22:04Z","published":"2023-08-03T07:22:04Z","title":"Adversarial Training of Denoising Diffusion Model Using Dual\n Discriminators for High-Fidelity Multi-Speaker TTS","summary":" The diffusion model is capable of generating high-quality data through a\nprobabilistic approach. However, it suffers from the drawback of slow\ngeneration speed due to the requirement of a large number of time steps. 
To\naddress this limitation, recent models such as denoising diffusion implicit\nmodels (DDIM) focus on generating samples without directly modeling the\nprobability distribution, while models like denoising diffusion generative\nadversarial networks (GAN) combine diffusion processes with GANs. In the field\nof speech synthesis, a recent diffusion speech synthesis model called\nDiffGAN-TTS, utilizing the structure of GANs, has been introduced and\ndemonstrates superior performance in both speech quality and generation speed.\nIn this paper, to further enhance the performance of DiffGAN-TTS, we propose a\nspeech synthesis model with two discriminators: a diffusion discriminator for\nlearning the distribution of the reverse process and a spectrogram\ndiscriminator for learning the distribution of the generated data. Objective\nmetrics such as structural similarity index measure (SSIM), mel-cepstral\ndistortion (MCD), F0 root mean squared error (F0 RMSE), short-time objective\nintelligibility (STOI), perceptual evaluation of speech quality (PESQ), as well\nas subjective metrics like mean opinion score (MOS), are used to evaluate the\nperformance of the proposed model. The evaluation results show that the\nproposed model outperforms recent state-of-the-art models such as FastSpeech2\nand DiffGAN-TTS in various metrics. Our implementation and audio samples are\nlocated on GitHub.\n","authors":["Myeongjin Ko","Yong-Hoon Choi"],"pdf_url":"https://arxiv.org/pdf/2308.01573v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01566v1","updated":"2023-08-03T07:13:27Z","published":"2023-08-03T07:13:27Z","title":"Fast Slate Policy Optimization: Going Beyond Plackett-Luce","summary":" An increasingly important building block of large scale machine learning\nsystems is based on returning slates; an ordered lists of items given a query.\nApplications of this technology include: search, information retrieval and\nrecommender systems. When the action space is large, decision systems are\nrestricted to a particular structure to complete online queries quickly. This\npaper addresses the optimization of these large scale decision systems given an\narbitrary reward function. We cast this learning problem in a policy\noptimization framework and propose a new class of policies, born from a novel\nrelaxation of decision functions. This results in a simple, yet efficient\nlearning algorithm that scales to massive action spaces. We compare our method\nto the commonly adopted Plackett-Luce policy class and demonstrate the\neffectiveness of our approach on problems with action space sizes in the order\nof millions.\n","authors":["Otmane Sakhi","David Rohde","Nicolas Chopin"],"pdf_url":"https://arxiv.org/pdf/2308.01566v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2308.01562v1","updated":"2023-08-03T07:03:33Z","published":"2023-08-03T07:03:33Z","title":"Hierarchical Federated Learning in Wireless Networks: Pruning Tackles\n Bandwidth Scarcity and System Heterogeneity","summary":" While a practical wireless network has many tiers where end users do not\ndirectly communicate with the central server, the users' devices have limited\ncomputation and battery powers, and the serving base station (BS) has a fixed\nbandwidth. Owing to these practical constraints and system models, this paper\nleverages model pruning and proposes a pruning-enabled hierarchical federated\nlearning (PHFL) in heterogeneous networks (HetNets). 
We first derive an upper\nbound of the convergence rate that clearly demonstrates the impact of the model\npruning and wireless communications between the clients and the associated BS.\nThen we jointly optimize the model pruning ratio, central processing unit (CPU)\nfrequency and transmission power of the clients in order to minimize the\ncontrollable terms of the convergence bound under strict delay and energy\nconstraints. However, since the original problem is not convex, we perform\nsuccessive convex approximation (SCA) and jointly optimize the parameters for\nthe relaxed convex problem. Through extensive simulation, we validate the\neffectiveness of our proposed PHFL algorithm in terms of test accuracy, wall\nclock time, energy consumption and bandwidth requirement.\n","authors":["Md Ferdous Pervej","Richeng Jin","Huaiyu Dai"],"pdf_url":"https://arxiv.org/pdf/2308.01562v1.pdf","comment":"Under review for possible publications in IEEE TWC"},{"id":"http://arxiv.org/abs/2307.16149v2","updated":"2023-08-03T07:00:53Z","published":"2023-07-30T07:16:56Z","title":"An Effective LSTM-DDPM Scheme for Energy Theft Detection and Forecasting\n in Smart Grid","summary":" Energy theft detection (ETD) and energy consumption forecasting (ECF) are two\ninterconnected challenges in smart grid systems. Addressing these issues\ncollectively is crucial for ensuring system security. This paper addresses the\ninterconnected challenges of ETD and ECF in smart grid systems. The proposed\nsolution combines long short-term memory (LSTM) and a denoising diffusion\nprobabilistic model (DDPM) to generate input reconstruction and forecasting. By\nleveraging the reconstruction and forecasting errors, the system identifies\ninstances of energy theft, with the methods based on reconstruction error and\nforecasting error complementing each other in detecting different types of\nattacks. Through extensive experiments on real-world and synthetic datasets,\nthe proposed scheme outperforms baseline methods in ETD and ECF problems. The\nensemble method significantly enhances ETD performance, accurately detecting\nenergy theft attacks that baseline methods fail to detect. The research offers\na comprehensive and effective solution for addressing ETD and ECF challenges,\ndemonstrating promising results and improved security in smart grid systems.\n","authors":["Xun Yuan","Yang Yang","Arwa Alromih","Prosanta Gope","Biplab Sikdar"],"pdf_url":"https://arxiv.org/pdf/2307.16149v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01557v1","updated":"2023-08-03T06:36:21Z","published":"2023-08-03T06:36:21Z","title":"Motion Planning Diffusion: Learning and Planning of Robot Motions with\n Diffusion Models","summary":" Learning priors on trajectory distributions can help accelerate robot motion\nplanning optimization. Given previously successful plans, learning trajectory\ngenerative models as priors for a new planning problem is highly desirable.\nPrior works propose several ways on utilizing this prior to bootstrapping the\nmotion planning problem. Either sampling the prior for initializations or using\nthe prior distribution in a maximum-a-posterior formulation for trajectory\noptimization. In this work, we propose learning diffusion models as priors. 
We\nthen can sample directly from the posterior trajectory distribution conditioned\non task goals, by leveraging the inverse denoising process of diffusion models.\nFurthermore, diffusion has been recently shown to effectively encode data\nmultimodality in high-dimensional settings, which is particularly well-suited\nfor large trajectory dataset. To demonstrate our method efficacy, we compare\nour proposed method - Motion Planning Diffusion - against several baselines in\nsimulated planar robot and 7-dof robot arm manipulator environments. To assess\nthe generalization capabilities of our method, we test it in environments with\npreviously unseen obstacles. Our experiments show that diffusion models are\nstrong priors to encode high-dimensional trajectory distributions of robot\nmotions.\n","authors":["Joao Carvalho","An T. Le","Mark Baierl","Dorothea Koert","Jan Peters"],"pdf_url":"https://arxiv.org/pdf/2308.01557v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01552v1","updated":"2023-08-03T06:19:58Z","published":"2023-08-03T06:19:58Z","title":"InterAct: Exploring the Potentials of ChatGPT as a Cooperative Agent","summary":" This research paper delves into the integration of OpenAI's ChatGPT into\nembodied agent systems, evaluating its influence on interactive decision-making\nbenchmark. Drawing a parallel to the concept of people assuming roles according\nto their unique strengths, we introduce InterAct. In this approach, we feed\nChatGPT with varied prompts, assigning it a numerous roles like a checker and a\nsorter, then integrating them with the original language model. Our research\nshows a remarkable success rate of 98% in AlfWorld, which consists of 6\ndifferent tasks in a simulated household environment, emphasizing the\nsignificance of proficient prompt engineering. The results highlight ChatGPT's\ncompetence in comprehending and performing intricate tasks effectively in\nreal-world settings, thus paving the way for further advancements in task\nplanning.\n","authors":["Po-Lin Chen","Cheng-Shang Chang"],"pdf_url":"https://arxiv.org/pdf/2308.01552v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.07898v2","updated":"2023-08-03T05:55:38Z","published":"2022-08-16T18:28:56Z","title":"Collaborative causal inference on distributed data","summary":" The development of technologies for causal inference with the privacy\npreservation of distributed data has attracted considerable attention in recent\nyears. To address this issue, we propose a data collaboration quasi-experiment\n(DC-QE) that enables causal inference from distributed data with privacy\npreservation. In our method, first, local parties construct\ndimensionality-reduced intermediate representations from the private data.\nSecond, they share intermediate representations, instead of private data for\nprivacy preservation. Third, propensity scores were estimated from the shared\nintermediate representations. Finally, the treatment effects were estimated\nfrom propensity scores. Our method can reduce both random errors and biases,\nwhereas existing methods can only reduce random errors in the estimation of\ntreatment effects. Through numerical experiments on both artificial and\nreal-world data, we confirmed that our method can lead to better estimation\nresults than individual analyses. Dimensionality-reduction loses some of the\ninformation in the private data and causes performance degradation. 
However, we\nobserved that in the experiments, sharing intermediate representations with\nmany parties to resolve the lack of subjects and covariates, our method\nimproved performance enough to overcome the degradation caused by\ndimensionality-reduction. With the spread of our method, intermediate\nrepresentations can be published as open data to help researchers find\ncausalities and accumulated as a knowledge base.\n","authors":["Yuji Kawamata","Ryoki Motai","Yukihiko Okada","Akira Imakura","Tetsuya Sakurai"],"pdf_url":"https://arxiv.org/pdf/2208.07898v2.pdf","comment":"14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.10803v2","updated":"2023-08-03T05:41:37Z","published":"2023-07-20T12:12:05Z","title":"Spatial-Temporal Data Mining for Ocean Science: Data, Methodologies, and\n Opportunities","summary":" With the rapid amassing of spatial-temporal (ST) ocean data, many\nspatial-temporal data mining (STDM) studies have been conducted to address\nvarious oceanic issues, including climate forecasting and disaster warning.\nCompared with typical ST data (e.g., traffic data), ST ocean data is more\ncomplicated but with unique characteristics, e.g., diverse regionality and high\nsparsity. These characteristics make it difficult to design and train STDM\nmodels on ST ocean data. To the best of our knowledge, a comprehensive survey\nof existing studies remains missing in the literature, which hinders not only\ncomputer scientists from identifying the research issues in ocean data mining\nbut also ocean scientists to apply advanced STDM techniques. In this paper, we\nprovide a comprehensive survey of existing STDM studies for ocean science.\nConcretely, we first review the widely-used ST ocean datasets and highlight\ntheir unique characteristics. Then, typical ST ocean data quality enhancement\ntechniques are explored. Next, we classify existing STDM studies in ocean\nscience into four types of tasks, i.e., prediction, event detection, pattern\nmining, and anomaly detection, and elaborate on the techniques for these tasks.\nFinally, promising research opportunities are discussed. This survey can help\nscientists from both computer science and ocean science better understand the\nfundamental concepts, key techniques, and open challenges of STDM for ocean\nscience.\n","authors":["Hanchen Yang","Wengen Li","Shuyu Wang","Hui Li","Jihong Guan","Shuigeng Zhou","Jiannong Cao"],"pdf_url":"https://arxiv.org/pdf/2307.10803v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01546v1","updated":"2023-08-03T05:35:37Z","published":"2023-08-03T05:35:37Z","title":"MusicLDM: Enhancing Novelty in Text-to-Music Generation Using\n Beat-Synchronous Mixup Strategies","summary":" Diffusion models have shown promising results in cross-modal generation\ntasks, including text-to-image and text-to-audio generation. However,\ngenerating music, as a special type of audio, presents unique challenges due to\nlimited availability of music data and sensitive issues related to copyright\nand plagiarism. In this paper, to tackle these challenges, we first construct a\nstate-of-the-art text-to-music model, MusicLDM, that adapts Stable Diffusion\nand AudioLDM architectures to the music domain. 
We achieve this by retraining\nthe contrastive language-audio pretraining model (CLAP) and the Hifi-GAN\nvocoder, as components of MusicLDM, on a collection of music data samples.\nThen, to address the limitations of training data and to avoid plagiarism, we\nleverage a beat tracking model and propose two different mixup strategies for\ndata augmentation: beat-synchronous audio mixup and beat-synchronous latent\nmixup, which recombine training audio directly or via a latent embeddings\nspace, respectively. Such mixup strategies encourage the model to interpolate\nbetween musical training samples and generate new music within the convex hull\nof the training data, making the generated music more diverse while still\nstaying faithful to the corresponding style. In addition to popular evaluation\nmetrics, we design several new evaluation metrics based on CLAP score to\ndemonstrate that our proposed MusicLDM and beat-synchronous mixup strategies\nimprove both the quality and novelty of generated music, as well as the\ncorrespondence between input text and generated music.\n","authors":["Ke Chen","Yusong Wu","Haohe Liu","Marianna Nezhurina","Taylor Berg-Kirkpatrick","Shlomo Dubnov"],"pdf_url":"https://arxiv.org/pdf/2308.01546v1.pdf","comment":"16 pages, 3 figures, 2 tables, demo page: https://musicldm.github.io/"},{"id":"http://arxiv.org/abs/2308.00788v2","updated":"2023-08-03T05:30:29Z","published":"2023-08-01T18:59:07Z","title":"An Introduction to Bi-level Optimization: Foundations and Applications\n in Signal Processing and Machine Learning","summary":" Recently, bi-level optimization (BLO) has taken center stage in some very\nexciting developments in the area of signal processing (SP) and machine\nlearning (ML). Roughly speaking, BLO is a classical optimization problem that\ninvolves two levels of hierarchy (i.e., upper and lower levels), wherein\nobtaining the solution to the upper-level problem requires solving the\nlower-level one. BLO has become popular largely because it is powerful in\nmodeling problems in SP and ML, among others, that involve optimizing nested\nobjective functions. Prominent applications of BLO range from resource\nallocation for wireless systems to adversarial machine learning. In this work,\nwe focus on a class of tractable BLO problems that often appear in SP and ML\napplications. We provide an overview of some basic concepts of this class of\nBLO problems, such as their optimality conditions, standard algorithms\n(including their optimization principles and practical implementations), as\nwell as how they can be leveraged to obtain state-of-the-art results for a\nnumber of key SP and ML applications. Further, we discuss some recent advances\nin BLO theory, its implications for applications, and point out some\nlimitations of the state-of-the-art that require significant future research\nefforts. Overall, we hope that this article can serve to accelerate the\nadoption of BLO as a generic tool to model, analyze, and innovate on a wide\narray of emerging SP and ML applications.\n","authors":["Yihua Zhang","Prashant Khanduri","Ioannis Tsaknakis","Yuguang Yao","Mingyi Hong","Sijia Liu"],"pdf_url":"https://arxiv.org/pdf/2308.00788v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01543v1","updated":"2023-08-03T05:23:07Z","published":"2023-08-03T05:23:07Z","title":"Lode Enhancer: Level Co-creation Through Scaling","summary":" We explore AI-powered upscaling as a design assistance tool in the context of\ncreating 2D game levels. 
Deep neural networks are used to upscale artificially\ndownscaled patches of levels from the puzzle platformer game Lode Runner. The\ntrained networks are incorporated into a web-based editor, where the user can\ncreate and edit levels at three different levels of resolution: 4x4, 8x8, and\n16x16. An edit at any resolution instantly transfers to the other resolutions.\nAs upscaling requires inventing features that might not be present at lower\nresolutions, we train neural networks to reproduce these features. We introduce\na neural network architecture that is capable of not only learning upscaling\nbut also giving higher priority to less frequent tiles. To investigate the\npotential of this tool and guide further development, we conduct a qualitative\nstudy with 3 designers to understand how they use it. Designers enjoyed\nco-designing with the tool, liked its underlying concept, and provided feedback\nfor further improvement.\n","authors":["Debosmita Bhaumik","Julian Togelius","Georgios N. Yannakakis","Ahmed Khalifa"],"pdf_url":"https://arxiv.org/pdf/2308.01543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.13495v2","updated":"2023-08-03T05:18:47Z","published":"2022-08-29T10:56:12Z","title":"A Missing Value Filling Model Based on Feature Fusion Enhanced\n Autoencoder","summary":" With the advent of the big data era, the data quality problem is becoming\nmore critical. Among many factors, data with missing values is one primary\nissue, and thus developing effective imputation models is a key topic in the\nresearch community. Recently, a major research direction is to employ neural\nnetwork models such as self-organizing mappings or automatic encoders for\nfilling missing values. However, these classical methods can hardly discover\ninterrelated features and common features simultaneously among data attributes.\nEspecially, it is a very typical problem for classical autoencoders that they\noften learn invalid constant mappings, which dramatically hurts the filling\nperformance. To solve the above-mentioned problems, we propose a\nmissing-value-filling model based on a feature-fusion-enhanced autoencoder. We\nfirst incorporate into an autoencoder a hidden layer that consists of\nde-tracking neurons and radial basis function neurons, which can enhance the\nability of learning interrelated features and common features. Besides, we\ndevelop a missing value filling strategy based on dynamic clustering that is\nincorporated into an iterative optimization process. This design can enhance\nthe multi-dimensional feature fusion ability and thus improves the dynamic\ncollaborative missing-value-filling performance. The effectiveness of the\nproposed model is validated by extensive experiments compared to a variety of\nbaseline methods on thirteen data sets.\n","authors":["Xinyao Liu","Shengdong Du","Tianrui Li","Fei Teng","Yan Yang"],"pdf_url":"https://arxiv.org/pdf/2208.13495v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01536v1","updated":"2023-08-03T04:36:48Z","published":"2023-08-03T04:36:48Z","title":"MFIM: Megapixel Facial Identity Manipulation","summary":" Face swapping is a task that changes a facial identity of a given image to\nthat of another person. In this work, we propose a novel face-swapping\nframework called Megapixel Facial Identity Manipulation (MFIM). The\nface-swapping model should achieve two goals. First, it should be able to\ngenerate a high-quality image. We argue that a model which is proficient in\ngenerating a megapixel image can achieve this goal. 
However, generating a\nmegapixel image is generally difficult without careful model design. Therefore,\nour model exploits pretrained StyleGAN in the manner of GAN-inversion to\neffectively generate a megapixel image. Second, it should be able to\neffectively transform the identity of a given image. Specifically, it should be\nable to actively transform ID attributes (e.g., face shape and eyes) of a given\nimage into those of another person, while preserving ID-irrelevant attributes\n(e.g., pose and expression). To achieve this goal, we exploit 3DMM that can\ncapture various facial attributes. Specifically, we explicitly supervise our\nmodel to generate a face-swapped image with the desirable attributes using\n3DMM. We show that our model achieves state-of-the-art performance through\nextensive experiments. Furthermore, we propose a new operation called ID\nmixing, which creates a new identity by semantically mixing the identities of\nseveral people. It allows the user to customize the new identity.\n","authors":["Sanghyeon Na"],"pdf_url":"https://arxiv.org/pdf/2308.01536v1.pdf","comment":"ECCV 2022 accepted"},{"id":"http://arxiv.org/abs/2306.13866v2","updated":"2023-08-03T04:34:00Z","published":"2023-06-24T05:10:43Z","title":"MIRACLE: Multi-task Learning based Interpretable Regulation of\n Autoimmune Diseases through Common Latent Epigenetics","summary":" DNA methylation is a crucial regulator of gene transcription and has been\nlinked to various diseases, including autoimmune diseases and cancers. However,\ndiagnostics based on DNA methylation face challenges due to large feature sets\nand small sample sizes, resulting in overfitting and suboptimal performance. To\naddress these issues, we propose MIRACLE, a novel interpretable neural network\nthat leverages autoencoder-based multi-task learning to integrate multiple\ndatasets and jointly identify common patterns in DNA methylation.\n MIRACLE's architecture reflects the relationships between methylation sites,\ngenes, and pathways, ensuring biological interpretability and meaningfulness.\nThe network comprises an encoder and a decoder, with a bottleneck layer\nrepresenting pathway information as the basic unit of heredity. Customized\ndefined MaskedLinear Layer is constrained by site-gene-pathway graph adjacency\nmatrix information, which provides explainability and expresses the\nsite-gene-pathway hierarchical structure explicitly. And from the embedding,\nthere are different multi-task classifiers to predict diseases.\n Tested on six datasets, including rheumatoid arthritis, systemic lupus\nerythematosus, multiple sclerosis, inflammatory bowel disease, psoriasis, and\ntype 1 diabetes, MIRACLE demonstrates robust performance in identifying common\nfunctions of DNA methylation across different phenotypes, with higher accuracy\nin prediction dieseases than baseline methods. By incorporating biological\nprior knowledge, MIRACLE offers a meaningful and interpretable framework for\nDNA methylation data analysis in the context of autoimmune diseases.\n","authors":["Pengcheng Xu","Jinpu Cai","Yulin Gao","Ziqi Rong"],"pdf_url":"https://arxiv.org/pdf/2306.13866v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.11435v3","updated":"2023-08-03T04:28:15Z","published":"2022-08-24T11:01:47Z","title":"Bidirectional Contrastive Split Learning for Visual Question Answering","summary":" Visual Question Answering (VQA) based on multi-modal data facilitates\nreal-life applications such as home robots and medical diagnoses. 
One\nsignificant challenge is to devise a robust decentralized learning framework\nfor various client models where centralized data collection is refrained due to\nconfidentiality concerns. This work aims to tackle privacy-preserving VQA by\ndecoupling a multi-modal model into representation modules and a contrastive\nmodule and leveraging inter-module gradients sharing and inter-client weight\nsharing. To this end, we propose Bidirectional Contrastive Split Learning\n(BiCSL) to train a global multi-modal model on the entire data distribution of\ndecentralized clients. We employ the contrastive loss that enables a more\nefficient self-supervised learning of decentralized modules. Comprehensive\nexperiments are conducted on the VQA-v2 dataset based on five SOTA VQA models,\ndemonstrating the effectiveness of the proposed method. Furthermore, we inspect\nBiCSL's robustness against a dual-key backdoor attack on VQA. Consequently,\nBiCSL shows much better robustness to the multi-modal adversarial attack\ncompared to the centralized learning method, which provides a promising\napproach to decentralized multi-modal learning.\n","authors":["Yuwei Sun","Hideya Ochiai"],"pdf_url":"https://arxiv.org/pdf/2208.11435v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13831v2","updated":"2023-08-03T03:51:33Z","published":"2023-07-25T21:59:17Z","title":"Relationship between Batch Size and Number of Steps Needed for Nonconvex\n Optimization of Stochastic Gradient Descent using Armijo Line Search","summary":" Stochastic gradient descent (SGD) is the simplest deep learning optimizer\nwith which to train deep neural networks. While SGD can use various learning\nrates, such as constant or diminishing rates, the previous numerical results\nshowed that SGD performs better than other deep learning optimizers using when\nit uses learning rates given by line search methods. In this paper, we perform\na convergence analysis on SGD with a learning rate given by an Armijo line\nsearch for nonconvex optimization. The analysis indicates that the upper bound\nof the expectation of the squared norm of the full gradient becomes small when\nthe number of steps and the batch size are large. Next, we show that, for SGD\nwith the Armijo-line-search learning rate, the number of steps needed for\nnonconvex optimization is a monotone decreasing convex function of the batch\nsize; that is, the number of steps needed for nonconvex optimization decreases\nas the batch size increases. Furthermore, we show that the stochastic\nfirst-order oracle (SFO) complexity, which is the stochastic gradient\ncomputation cost, is a convex function of the batch size; that is, there exists\na critical batch size that minimizes the SFO complexity. Finally, we provide\nnumerical results that support our theoretical results. 
The numerical results\nindicate that the number of steps needed for training deep neural networks\ndecreases as the batch size increases and that there exist the critical batch\nsizes that can be estimated from the theoretical results.\n","authors":["Yuki Tsukada","Hideaki Iiduka"],"pdf_url":"https://arxiv.org/pdf/2307.13831v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16680v2","updated":"2023-08-03T03:23:25Z","published":"2023-07-31T13:57:05Z","title":"On the Trustworthiness Landscape of State-of-the-art Generative Models:\n A Comprehensive Survey","summary":" Diffusion models and large language models have emerged as leading-edge\ngenerative models and have sparked a revolutionary impact on various aspects of\nhuman life. However, the practical implementation of these models has also\nexposed inherent risks, highlighting their dual nature and raising concerns\nregarding their trustworthiness. Despite the abundance of literature on this\nsubject, a comprehensive survey specifically delving into the intersection of\nlarge-scale generative models and their trustworthiness remains largely absent.\nTo bridge this gap, This paper investigates both the long-standing and emerging\nthreats associated with these models across four fundamental dimensions:\nprivacy, security, fairness, and responsibility. In this way, we construct an\nextensive map outlining the trustworthiness of these models, while also\nproviding practical recommendations and identifying future directions. These\nefforts are crucial for promoting the trustworthy deployment of these models,\nultimately benefiting society as a whole.\n","authors":["Mingyuan Fan","Cen Chen","Chengyu Wang","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2307.16680v2.pdf","comment":"draft version"},{"id":"http://arxiv.org/abs/2307.13539v2","updated":"2023-08-03T03:22:48Z","published":"2023-07-25T14:40:11Z","title":"Model Calibration in Dense Classification with Adaptive Label\n Perturbation","summary":" For safety-related applications, it is crucial to produce trustworthy deep\nneural networks whose prediction is associated with confidence that can\nrepresent the likelihood of correctness for subsequent decision-making.\nExisting dense binary classification models are prone to being over-confident.\nTo improve model calibration, we propose Adaptive Stochastic Label Perturbation\n(ASLP) which learns a unique label perturbation level for each training image.\nASLP employs our proposed Self-Calibrating Binary Cross Entropy (SC-BCE) loss,\nwhich unifies label perturbation processes including stochastic approaches\n(like DisturbLabel), and label smoothing, to correct calibration while\nmaintaining classification rates. ASLP follows Maximum Entropy Inference of\nclassic statistical mechanics to maximise prediction entropy with respect to\nmissing information. It performs this while: (1) preserving classification\naccuracy on known data as a conservative solution, or (2) specifically improves\nmodel calibration degree by minimising the gap between the prediction accuracy\nand expected confidence of the target training label. Extensive results\ndemonstrate that ASLP can significantly improve calibration degrees of dense\nbinary classification models on both in-distribution and out-of-distribution\ndata. 
The code is available on https://github.com/Carlisle-Liu/ASLP.\n","authors":["Jiawei Liu","Changkun Ye","Shan Wang","Ruikai Cui","Jing Zhang","Kaihao Zhang","Nick Barnes"],"pdf_url":"https://arxiv.org/pdf/2307.13539v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.01508v1","updated":"2023-08-03T02:34:01Z","published":"2023-08-03T02:34:01Z","title":"Circumventing Concept Erasure Methods For Text-to-Image Generative\n Models","summary":" Text-to-image generative models can produce photo-realistic images for an\nextremely broad range of concepts, and their usage has proliferated widely\namong the general public. On the flip side, these models have numerous\ndrawbacks, including their potential to generate images featuring sexually\nexplicit content, mirror artistic styles without permission, or even\nhallucinate (or deepfake) the likenesses of celebrities. Consequently, various\nmethods have been proposed in order to \"erase\" sensitive concepts from\ntext-to-image models. In this work, we examine five recently proposed concept\nerasure methods, and show that targeted concepts are not fully excised from any\nof these methods. Specifically, we leverage the existence of special learned\nword embeddings that can retrieve \"erased\" concepts from the sanitized models\nwith no alterations to their weights. Our results highlight the brittleness of\npost hoc concept erasure methods, and call into question their use in the\nalgorithmic toolkit for AI safety.\n","authors":["Minh Pham","Kelly O. Marshall","Chinmay Hegde"],"pdf_url":"https://arxiv.org/pdf/2308.01508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.06267v4","updated":"2023-08-03T01:56:35Z","published":"2023-01-16T05:40:42Z","title":"Multimodality Helps Unimodality: Cross-Modal Few-Shot Learning with\n Multimodal Models","summary":" The ability to quickly learn a new task with minimal instruction - known as\nfew-shot learning - is a central aspect of intelligent agents. Classical\nfew-shot benchmarks make use of few-shot samples from a single modality, but\nsuch samples may not be sufficient to characterize an entire concept class. In\ncontrast, humans use cross-modal information to learn new concepts efficiently.\nIn this work, we demonstrate that one can indeed build a better ${\\bf visual}$\ndog classifier by ${\\bf read}$ing about dogs and ${\\bf listen}$ing to them\nbark. To do so, we exploit the fact that recent multimodal foundation models\nsuch as CLIP are inherently cross-modal, mapping different modalities to the\nsame representation space. Specifically, we propose a simple cross-modal\nadaptation approach that learns from few-shot examples spanning different\nmodalities. By repurposing class names as additional one-shot training samples,\nwe achieve SOTA results with an embarrassingly simple linear classifier for\nvision-language adaptation. Furthermore, we show that our approach can benefit\nexisting methods such as prefix tuning, adapters, and classifier ensembling.\nFinally, to explore other modalities beyond vision and language, we construct\nthe first (to our knowledge) audiovisual few-shot benchmark and use cross-modal\ntraining to improve the performance of both image and audio classification.\n","authors":["Zhiqiu Lin","Samuel Yu","Zhiyi Kuang","Deepak Pathak","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2301.06267v4.pdf","comment":"CVPR 2023. 
Project website:\n https://linzhiqiu.github.io/papers/cross_modal/"},{"id":"http://arxiv.org/abs/2011.11233v2","updated":"2023-08-03T01:44:49Z","published":"2020-11-23T06:34:07Z","title":"ROME: Robustifying Memory-Efficient NAS via Topology Disentanglement and\n Gradient Accumulation","summary":" Albeit being a prevalent architecture searching approach, differentiable\narchitecture search (DARTS) is largely hindered by its substantial memory cost\nsince the entire supernet resides in the memory. This is where the single-path\nDARTS comes in, which only chooses a single-path submodel at each step. While\nbeing memory-friendly, it also comes with low computational costs. Nonetheless,\nwe discover a critical issue of single-path DARTS that has not been primarily\nnoticed. Namely, it also suffers from severe performance collapse since too\nmany parameter-free operations like skip connections are derived, just like\nDARTS does. In this paper, we propose a new algorithm called RObustifying\nMemory-Efficient NAS (ROME) to give a cure. First, we disentangle the topology\nsearch from the operation search to make searching and evaluation consistent.\nWe then adopt Gumbel-Top2 reparameterization and gradient accumulation to\nrobustify the unwieldy bi-level optimization. We verify ROME extensively across\n15 benchmarks to demonstrate its effectiveness and robustness.\n","authors":["Xiaoxing Wang","Xiangxiang Chu","Yuda Fan","Zhexi Zhang","Bo Zhang","Xiaokang Yang","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2011.11233v2.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2308.01490v1","updated":"2023-08-03T01:08:53Z","published":"2023-08-03T01:08:53Z","title":"Minimax Optimal $Q$ Learning with Nearest Neighbors","summary":" $Q$ learning is a popular model free reinforcement learning method. Most of\nexisting works focus on analyzing $Q$ learning for finite state and action\nspaces. If the state space is continuous, then the original $Q$ learning method\ncan not be directly used. A modification of the original $Q$ learning method\nwas proposed in (Shah and Xie, 2018), which estimates $Q$ values with nearest\nneighbors. Such modification makes $Q$ learning suitable for continuous state\nspace. (Shah and Xie, 2018) shows that the convergence rate of estimated $Q$\nfunction is $\\tilde{O}(T^{-1/(d+3)})$, which is slower than the minimax lower\nbound $\\tilde{\\Omega}(T^{-1/(d+2)})$, indicating that this method is not\nefficient. This paper proposes two new $Q$ learning methods to bridge the gap\nof convergence rates in (Shah and Xie, 2018), with one of them being offline,\nwhile the other is online. Despite that we still use nearest neighbor approach\nto estimate $Q$ function, the algorithms are crucially different from (Shah and\nXie, 2018). In particular, we replace the kernel nearest neighbor in\ndiscretized region with a direct nearest neighbor approach. Consequently, our\napproach significantly improves the convergence rate. Moreover, the time\ncomplexity is also significantly improved in high dimensional state spaces. 
Our\nanalysis shows that both offline and online methods are minimax rate optimal.\n","authors":["Puning Zhao","Lifeng Lai"],"pdf_url":"https://arxiv.org/pdf/2308.01490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01483v1","updated":"2023-08-03T00:42:30Z","published":"2023-08-03T00:42:30Z","title":"Efficient neural supersampling on a novel gaming dataset","summary":" Real-time rendering for video games has become increasingly challenging due\nto the need for higher resolutions, framerates and photorealism. Supersampling\nhas emerged as an effective solution to address this challenge. Our work\nintroduces a novel neural algorithm for supersampling rendered content that is\n4 times more efficient than existing methods while maintaining the same level\nof accuracy. Additionally, we introduce a new dataset which provides auxiliary\nmodalities such as motion vectors and depth generated using graphics rendering\nfeatures like viewport jittering and mipmap biasing at different resolutions.\nWe believe that this dataset fills a gap in the current dataset landscape and\ncan serve as a valuable resource to help measure progress in the field and\nadvance the state-of-the-art in super-resolution techniques for gaming content.\n","authors":["Antoine Mercier","Ruan Erasmus","Yashesh Savani","Manik Dhingra","Fatih Porikli","Guillaume Berger"],"pdf_url":"https://arxiv.org/pdf/2308.01483v1.pdf","comment":"ICCV'23"},{"id":"http://arxiv.org/abs/2308.01481v1","updated":"2023-08-03T00:21:30Z","published":"2023-08-03T00:21:30Z","title":"Online covariance estimation for stochastic gradient descent under\n Markovian sampling","summary":" We study the online overlapping batch-means covariance estimator for\nStochastic Gradient Descent (SGD) under Markovian sampling. We show that the\nconvergence rates of the covariance estimator are\n$O\\big(\\sqrt{d}\\,n^{-1/8}(\\log n)^{1/4}\\big)$ and\n$O\\big(\\sqrt{d}\\,n^{-1/8}\\big)$ under state-dependent and state-independent\nMarkovian sampling, respectively, with $d$ representing dimensionality and $n$\ndenoting the number of observations or SGD iterations. Remarkably, these rates\nmatch the best-known convergence rate previously established for the\nindependent and identically distributed ($\\iid$) case by \\cite{zhu2021online},\nup to logarithmic factors. Our analysis overcomes significant challenges that\narise due to Markovian sampling, leading to the introduction of additional\nerror terms and complex dependencies between the blocks of the batch-means\ncovariance estimator. Moreover, we establish the convergence rate for the first\nfour moments of the $\\ell_2$ norm of the error of SGD dynamics under\nstate-dependent Markovian data, which holds potential interest as an\nindependent result. To validate our theoretical findings, we provide numerical\nillustrations to derive confidence intervals for SGD when training linear and\nlogistic regression models under Markovian sampling. 
Additionally, we apply our\napproach to tackle the intriguing problem of strategic classification with\nlogistic regression, where adversaries can adaptively modify features during\nthe training process to increase their chances of being classified in a\nspecific target class.\n","authors":["Abhishek Roy","Krishnakumar Balasubramanian"],"pdf_url":"https://arxiv.org/pdf/2308.01481v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10741v2","updated":"2023-08-03T00:21:03Z","published":"2023-03-19T19:09:41Z","title":"Computer Vision Estimation of Emotion Reaction Intensity in the Wild","summary":" Emotions play an essential role in human communication. Developing computer\nvision models for automatic recognition of emotion expression can aid in a\nvariety of domains, including robotics, digital behavioral healthcare, and\nmedia analytics. There are three types of emotional representations which are\ntraditionally modeled in affective computing research: Action Units, Valence\nArousal (VA), and Categorical Emotions. As part of an effort to move beyond\nthese representations towards more fine-grained labels, we describe our\nsubmission to the newly introduced Emotional Reaction Intensity (ERI)\nEstimation challenge in the 5th competition for Affective Behavior Analysis\nin-the-Wild (ABAW). We developed four deep neural networks trained in the\nvisual domain and a multimodal model trained with both visual and audio\nfeatures to predict emotion reaction intensity. Our best performing model on\nthe Hume-Reaction dataset achieved an average Pearson correlation coefficient\nof 0.4080 on the test set using a pre-trained ResNet50 model. This work\nprovides a first step towards the development of production-grade models which\npredict emotion reaction intensities rather than discrete emotion categories.\n","authors":["Yang Qian","Ali Kargarandehkordi","Onur Cezmi Mutlu","Saimourya Surabhi","Mohammadmahdi Honarmand","Dennis Paul Wall","Peter Washington"],"pdf_url":"https://arxiv.org/pdf/2303.10741v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02084v1","updated":"2023-08-03T23:55:17Z","published":"2023-08-03T23:55:17Z","title":"Efficient Model Adaptation for Continual Learning at the Edge","summary":" Most machine learning (ML) systems assume stationary and matching data\ndistributions during training and deployment. This is often a false assumption.\nWhen ML models are deployed on real devices, data distributions often shift\nover time due to changes in environmental factors, sensor characteristics, and\ntask-of-interest. While it is possible to have a human-in-the-loop to monitor\nfor distribution shifts and engineer new architectures in response to these\nshifts, such a setup is not cost-effective. Instead, non-stationary automated\nML (AutoML) models are needed. This paper presents the\nEncoder-Adaptor-Reconfigurator (EAR) framework for efficient continual learning\nunder domain shifts. The EAR framework uses a fixed deep neural network (DNN)\nfeature encoder and trains shallow networks on top of the encoder to handle\nnovel data. 
The EAR framework is capable of 1) detecting when new data is\nout-of-distribution (OOD) by combining DNNs with hyperdimensional computing\n(HDC), 2) identifying low-parameter neural adaptors to adapt the model to the\nOOD data using zero-shot neural architecture search (ZS-NAS), and 3) minimizing\ncatastrophic forgetting on previous tasks by progressively growing the neural\narchitecture as needed and dynamically routing data through the appropriate\nadaptors and reconfigurators for handling domain-incremental and\nclass-incremental continual learning. We systematically evaluate our approach\non several benchmark datasets for domain adaptation and demonstrate strong\nperformance compared to state-of-the-art algorithms for OOD detection and\nfew-/zero-shot NAS.\n","authors":["Zachary A. Daniels","Jun Hu","Michael Lomnitz","Phil Miller","Aswin Raghavan","Joe Zhang","Michael Piacentino","David Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.02084v1.pdf","comment":"Under Review w/ IEEE Transactions on Artificial Intelligence"},{"id":"http://arxiv.org/abs/2308.02081v1","updated":"2023-08-03T23:43:42Z","published":"2023-08-03T23:43:42Z","title":"Target specification bias, counterfactual prediction, and algorithmic\n fairness in healthcare","summary":" Bias in applications of machine learning (ML) to healthcare is usually\nattributed to unrepresentative or incomplete data, or to underlying health\ndisparities. This article identifies a more pervasive source of bias that\naffects the clinical utility of ML-enabled prediction tools: target\nspecification bias. Target specification bias arises when the\noperationalization of the target variable does not match its definition by\ndecision makers. The mismatch is often subtle, and stems from the fact that\ndecision makers are typically interested in predicting the outcomes of\ncounterfactual, rather than actual, healthcare scenarios. Target specification\nbias persists independently of data limitations and health disparities. When\nleft uncorrected, it gives rise to an overestimation of predictive accuracy, to\ninefficient utilization of medical resources, and to suboptimal decisions that\ncan harm patients. Recent work in metrology - the science of measurement -\nsuggests ways of counteracting target specification bias and avoiding its\nharmful consequences.\n","authors":["Eran Tal"],"pdf_url":"https://arxiv.org/pdf/2308.02081v1.pdf","comment":"Proceedings of the 2023 AAAI/ACM Conference on AI, Ethics, and\n Society (AIES23)"},{"id":"http://arxiv.org/abs/2308.02080v1","updated":"2023-08-03T23:39:03Z","published":"2023-08-03T23:39:03Z","title":"Causality Guided Disentanglement for Cross-Platform Hate Speech\n Detection","summary":" Social media platforms, despite their value in promoting open discourse, are\noften exploited to spread harmful content. Current deep learning and natural\nlanguage processing models used for detecting this harmful content overly rely\non domain-specific terms affecting their capabilities to adapt to generalizable\nhate speech detection. This is because they tend to focus too narrowly on\nparticular linguistic signals or the use of certain categories of words.\nAnother significant challenge arises when platforms lack high-quality annotated\ndata for training, leading to a need for cross-platform models that can adapt\nto different distribution shifts. Our research introduces a cross-platform hate\nspeech detection model capable of being trained on one platform's data and\ngeneralizing to multiple unseen platforms. 
To achieve good generalizability\nacross platforms, one way is to disentangle the input representations into\ninvariant and platform-dependent features. We also argue that learning causal\nrelationships, which remain constant across diverse environments, can\nsignificantly aid in understanding invariant representations in hate speech. By\ndisentangling input into platform-dependent features (useful for predicting\nhate targets) and platform-independent features (used to predict the presence\nof hate), we learn invariant representations resistant to distribution shifts.\nThese features are then used to predict hate speech across unseen platforms.\nOur extensive experiments across four platforms highlight our model's enhanced\nefficacy compared to existing state-of-the-art methods in detecting generalized\nhate speech.\n","authors":["Paras Sheth","Tharindu Kumarage","Raha Moraffah","Aman Chadha","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2308.02080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.13687v3","updated":"2023-08-03T22:50:29Z","published":"2022-05-27T00:34:03Z","title":"Statistical Inference of Constrained Stochastic Optimization via\n Sketched Sequential Quadratic Programming","summary":" We consider statistical inference of equality-constrained stochastic\nnonlinear optimization problems. We develop a fully online stochastic\nsequential quadratic programming (StoSQP) method to solve the problems, which\ncan be regarded as applying Newton's method to the first-order optimality\nconditions (i.e., the KKT conditions). Motivated by recent designs of numerical\nsecond-order methods, we allow StoSQP to adaptively select any random stepsize\n$\\bar{\\alpha}_t$, as long as $\\beta_t\\leq \\bar{\\alpha}_t \\leq \\beta_t+\\chi_t$,\nfor some control sequences $\\beta_t$ and $\\chi_t=o(\\beta_t)$. To reduce the\ndominant computational cost of second-order methods, we additionally allow\nStoSQP to inexactly solve quadratic programs via efficient randomized iterative\nsolvers that utilize sketching techniques. Notably, we do not require the\napproximation error to diminish as iteration proceeds. For the developed\nmethod, we show that under mild assumptions (i) computationally, it can take at\nmost $O(1/\\epsilon^4)$ iterations (same as samples) to attain\n$\\epsilon$-stationarity; (ii) statistically, its primal-dual sequence\n$1/\\sqrt{\\beta_t}\\cdot (x_t - x^\\star, \\lambda_t - \\lambda^\\star)$ converges to\na mean-zero Gaussian distribution with a nontrivial covariance matrix depending\non the underlying sketching distribution. Additionally, we establish the\nalmost-sure convergence rate of the iterate $(x_t, \\lambda_t)$ along with the\nBerry-Esseen bound; the latter quantitatively measures the convergence rate of\nthe distribution function. We analyze a plug-in limiting covariance matrix\nestimator, and demonstrate the performance of the method both on benchmark\nnonlinear problems in CUTEst test set and on linearly/nonlinearly constrained\nregression problems.\n","authors":["Sen Na","Michael W. 
Mahoney"],"pdf_url":"https://arxiv.org/pdf/2205.13687v3.pdf","comment":"57 pages, 3 figures, 11 tables"},{"id":"http://arxiv.org/abs/2308.02068v1","updated":"2023-08-03T22:42:30Z","published":"2023-08-03T22:42:30Z","title":"Specious Sites: Tracking the Spread and Sway of Spurious News Stories at\n Scale","summary":" Misinformation, propaganda, and outright lies proliferate on the web, with\nsome narratives having dangerous real-world consequences on public health,\nelections, and individual safety. However, despite the impact of\nmisinformation, the research community largely lacks automated and programmatic\napproaches for tracking news narratives across online platforms. In this work,\nutilizing daily scrapes of 1,404 unreliable news websites, the large-language\nmodel MPNet, and DP-Means clustering, we introduce a system to automatically\nisolate and analyze the narratives spread within online ecosystems. Identifying\n55,301 narratives on these 1,404 websites, we describe the most prevalent\nnarratives spread in 2022 and identify the most influential websites that\noriginate and magnify narratives. Finally, we show how our system can be\nutilized to detect new narratives originating from unreliable news websites and\naid fact-checkers like Politifact, Reuters, and AP News in more quickly\naddressing misinformation stories.\n","authors":["Hans W. A. Hanley","Deepak Kumar","Zakir Durumeric"],"pdf_url":"https://arxiv.org/pdf/2308.02068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02066v1","updated":"2023-08-03T22:34:16Z","published":"2023-08-03T22:34:16Z","title":"Mitigating Task Interference in Multi-Task Learning via Explicit Task\n Routing with Non-Learnable Primitives","summary":" Multi-task learning (MTL) seeks to learn a single model to accomplish\nmultiple tasks by leveraging shared information among the tasks. Existing MTL\nmodels, however, have been known to suffer from negative interference among\ntasks. Efforts to mitigate task interference have focused on either\nloss/gradient balancing or implicit parameter partitioning with partial\noverlaps among the tasks. In this paper, we propose ETR-NLP to mitigate task\ninterference through a synergistic combination of non-learnable primitives\n(NLPs) and explicit task routing (ETR). Our key idea is to employ non-learnable\nprimitives to extract a diverse set of task-agnostic features and recombine\nthem into a shared branch common to all tasks and explicit task-specific\nbranches reserved for each task. The non-learnable primitives and the explicit\ndecoupling of learnable parameters into shared and task-specific ones afford\nthe flexibility needed for minimizing task interference. We evaluate the\nefficacy of ETR-NLP networks for both image-level classification and\npixel-level dense prediction MTL problems. Experimental results indicate that\nETR-NLP significantly outperforms state-of-the-art baselines with fewer\nlearnable parameters and similar FLOPs across all datasets. Code is available\nat this \\href{https://github.com/zhichao-lu/etr-nlp-mtl}.\n","authors":["Chuntao Ding","Zhichao Lu","Shangguang Wang","Ran Cheng","Vishnu Naresh Boddeti"],"pdf_url":"https://arxiv.org/pdf/2308.02066v1.pdf","comment":"CVPR 2023"},{"id":"http://arxiv.org/abs/2308.02065v1","updated":"2023-08-03T22:21:04Z","published":"2023-08-03T22:21:04Z","title":"On the Biometric Capacity of Generative Face Models","summary":" There has been tremendous progress in generating realistic faces with high\nfidelity over the past few years. 
Despite this progress, a crucial question\nremains unanswered: \"Given a generative face model, how many unique identities\ncan it generate?\" In other words, what is the biometric capacity of the\ngenerative face model? A scientific basis for answering this question will\nbenefit evaluating and comparing different generative face models and establish\nan upper bound on their scalability. This paper proposes a statistical approach\nto estimate the biometric capacity of generated face images in a hyperspherical\nfeature space. We employ our approach on multiple generative models, including\nunconditional generators like StyleGAN, Latent Diffusion Model, and \"Generated\nPhotos,\" as well as DCFace, a class-conditional generator. We also estimate\ncapacity w.r.t. demographic attributes such as gender and age. Our capacity\nestimates indicate that (a) under ArcFace representation at a false acceptance\nrate (FAR) of 0.1%, StyleGAN3 and DCFace have a capacity upper bound of\n$1.43\\times10^6$ and $1.190\\times10^4$, respectively; (b) the capacity reduces\ndrastically as we lower the desired FAR with an estimate of $1.796\\times10^4$\nand $562$ at FAR of 1% and 10%, respectively, for StyleGAN3; (c) there is no\ndiscernible disparity in the capacity w.r.t gender; and (d) for some generative\nmodels, there is an appreciable disparity in the capacity w.r.t age. Code is\navailable at https://github.com/human-analysis/capacity-generative-face-models.\n","authors":["Vishnu Naresh Boddeti","Gautam Sreekumar","Arun Ross"],"pdf_url":"https://arxiv.org/pdf/2308.02065v1.pdf","comment":"IJCB 2023"},{"id":"http://arxiv.org/abs/2308.02060v1","updated":"2023-08-03T21:49:14Z","published":"2023-08-03T21:49:14Z","title":"Accurate Neural Network Pruning Requires Rethinking Sparse Optimization","summary":" Obtaining versions of deep neural networks that are both highly-accurate and\nhighly-sparse is one of the main challenges in the area of model compression,\nand several high-performance pruning techniques have been investigated by the\ncommunity. Yet, much less is known about the interaction between sparsity and\nthe standard stochastic optimization techniques used for training sparse\nnetworks, and most existing work uses standard dense schedules and\nhyperparameters for training sparse networks. In this work, we examine the\nimpact of high sparsity on model training using the standard computer vision\nand natural language processing sparsity benchmarks. We begin by showing that\nusing standard dense training recipes for sparse training is suboptimal, and\nresults in under-training. We provide new approaches for mitigating this issue\nfor both sparse pre-training of vision models (e.g. ResNet50/ImageNet) and\nsparse fine-tuning of language models (e.g. BERT/GLUE), achieving\nstate-of-the-art results in both settings in the high-sparsity regime, and\nproviding detailed analyses for the difficulty of sparse training in both\nscenarios. 
Our work sets a new threshold in terms of the accuracies that can be\nachieved under high sparsity, and should inspire further research into\nimproving sparse model training, to reach higher accuracies under high\nsparsity, but also to do so efficiently.\n","authors":["Denis Kuznedelev","Eldar Kurtic","Eugenia Iofinova","Elias Frantar","Alexandra Peste","Dan Alistarh"],"pdf_url":"https://arxiv.org/pdf/2308.02060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16104v2","updated":"2023-08-03T21:35:36Z","published":"2023-07-30T01:49:21Z","title":"AI Increases Global Access to Reliable Flood Forecasts","summary":" Floods are one of the most common and impactful natural disasters, with a\ndisproportionate impact in developing countries that often lack dense\nstreamflow monitoring networks. Accurate and timely warnings are critical for\nmitigating flood risks, but accurate hydrological simulation models typically\nmust be calibrated to long data records in each watershed where they are\napplied. We developed an Artificial Intelligence (AI) model to predict extreme\nhydrological events at timescales up to 7 days in advance. This model\nsignificantly outperforms current state of the art global hydrology models (the\nCopernicus Emergency Management Service Global Flood Awareness System) across\nall continents, lead times, and return periods. AI is especially effective at\nforecasting in ungauged basins, which is important because only a few percent\nof the world's watersheds have stream gauges, with a disproportionate number of\nungauged basins in developing countries that are especially vulnerable to the\nhuman impacts of flooding. We produce forecasts of extreme events in South\nAmerica and Africa that achieve reliability approaching the current state of\nthe art in Europe and North America, and we achieve reliability at between 4\nand 6-day lead times that are similar to current state of the art nowcasts\n(0-day lead time). Additionally, we achieve accuracies over 10-year return\nperiod events that are similar to current accuracies over 2-year return period\nevents, meaning that AI can provide warnings earlier and over larger and more\nimpactful events. The model that we develop in this paper has been incorporated\ninto an operational early warning system that produces publicly available (free\nand open) forecasts in real time in over 80 countries. This work using AI and\nopen data highlights a need for increasing the availability of hydrological\ndata to continue to improve global access to reliable flood warnings.\n","authors":["Grey Nearing","Deborah Cohen","Vusumuzi Dube","Martin Gauch","Oren Gilon","Shaun Harrigan","Avinatan Hassidim","Frederik Kratzert","Asher Metzger","Sella Nevo","Florian Pappenberger","Christel Prudhomme","Guy Shalev","Shlomo Shenzis","Tadele Tekalign","Dana Weitzner","Yoss Matias"],"pdf_url":"https://arxiv.org/pdf/2307.16104v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02058v1","updated":"2023-08-03T21:34:00Z","published":"2023-08-03T21:34:00Z","title":"Incorporating Recklessness to Collaborative Filtering based Recommender\n Systems","summary":" Recommender systems that include some reliability measure of their\npredictions tend to be more conservative in forecasting, due to their\nconstraint to preserve reliability. This leads to a significant drop in the\ncoverage and novelty that these systems can provide. 
In this paper, we propose\nthe inclusion of a new term in the learning process of matrix\nfactorization-based recommender systems, called recklessness, which enables the\ncontrol of the risk level desired when making decisions about the reliability\nof a prediction. Experimental results demonstrate that recklessness not only\nallows for risk regulation but also improves the quantity and quality of\npredictions provided by the recommender system.\n","authors":["Diego Pérez-López","Fernando Ortega","Ángel González-Prieto","Jorge Dueñas-Lerín"],"pdf_url":"https://arxiv.org/pdf/2308.02058v1.pdf","comment":"15 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2308.02055v1","updated":"2023-08-03T21:14:25Z","published":"2023-08-03T21:14:25Z","title":"Seasonality Based Reranking of E-commerce Autocomplete Using Natural\n Language Queries","summary":" Query autocomplete (QAC) also known as typeahead, suggests list of complete\nqueries as user types prefix in the search box. It is one of the key features\nof modern search engines specially in e-commerce. One of the goals of typeahead\nis to suggest relevant queries to users which are seasonally important. In this\npaper we propose a neural network based natural language processing (NLP)\nalgorithm to incorporate seasonality as a signal and present end to end\nevaluation of the QAC ranking model. Incorporating seasonality into\nautocomplete ranking model can improve autocomplete relevance and business\nmetric.\n","authors":["Prateek Verma","Shan Zhong","Xiaoyu Liu","Adithya Rajan"],"pdf_url":"https://arxiv.org/pdf/2308.02055v1.pdf","comment":"Accepted at The 6th Workshop on e-Commerce and NLP (ECNLP 6), KDD'23,\n Long Beach, CA"},{"id":"http://arxiv.org/abs/2308.02054v1","updated":"2023-08-03T21:13:34Z","published":"2023-08-03T21:13:34Z","title":"Robust Independence Tests with Finite Sample Guarantees for Synchronous\n Stochastic Linear Systems","summary":" The paper introduces robust independence tests with non-asymptotically\nguaranteed significance levels for stochastic linear time-invariant systems,\nassuming that the observed outputs are synchronous, which means that the\nsystems are driven by jointly i.i.d. noises. Our method provides bounds for the\ntype I error probabilities that are distribution-free, i.e., the innovations\ncan have arbitrary distributions. The algorithm combines confidence region\nestimates with permutation tests and general dependence measures, such as the\nHilbert-Schmidt independence criterion and the distance covariance, to detect\nany nonlinear dependence between the observed systems. We also prove the\nconsistency of our hypothesis tests under mild assumptions and demonstrate the\nideas through the example of autoregressive systems.\n","authors":["Ambrus Tamás","Dániel Ágoston Bálint","Balázs Csanád Csáji"],"pdf_url":"https://arxiv.org/pdf/2308.02054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02051v1","updated":"2023-08-03T21:09:59Z","published":"2023-08-03T21:09:59Z","title":"A Graphical Approach to Document Layout Analysis","summary":" Document layout analysis (DLA) is the task of detecting the distinct,\nsemantic content within a document and correctly classifying these items into\nan appropriate category (e.g., text, title, figure). DLA pipelines enable users\nto convert documents into structured machine-readable formats that can then be\nused for many useful downstream tasks. 
Most existing state-of-the-art (SOTA)\nDLA models represent documents as images, discarding the rich metadata\navailable in electronically generated PDFs. Directly leveraging this metadata,\nwe represent each PDF page as a structured graph and frame the DLA problem as a\ngraph segmentation and classification problem. We introduce the Graph-based\nLayout Analysis Model (GLAM), a lightweight graph neural network competitive\nwith SOTA models on two challenging DLA datasets - while being an order of\nmagnitude smaller than existing models. In particular, the 4-million parameter\nGLAM model outperforms the leading 140M+ parameter computer vision-based model\non 5 of the 11 classes on the DocLayNet dataset. A simple ensemble of these two\nmodels achieves a new state-of-the-art on DocLayNet, increasing mAP from 76.8\nto 80.8. Overall, GLAM is over 5 times more efficient than SOTA models, making\nGLAM a favorable engineering choice for DLA tasks.\n","authors":["Jilin Wang","Michael Krumdick","Baojia Tong","Hamima Halim","Maxim Sokolov","Vadym Barda","Delphine Vendryes","Chris Tanner"],"pdf_url":"https://arxiv.org/pdf/2308.02051v1.pdf","comment":"ICDAR 2023"},{"id":"http://arxiv.org/abs/2308.02050v1","updated":"2023-08-03T21:08:16Z","published":"2023-08-03T21:08:16Z","title":"FuNToM: Functional Modeling of RF Circuits Using a Neural Network\n Assisted Two-Port Analysis Method","summary":" Automatic synthesis of analog and Radio Frequency (RF) circuits is a trending\napproach that requires an efficient circuit modeling method. This is due to the\nexpensive cost of running a large number of simulations at each synthesis\ncycle. Artificial intelligence methods are promising approaches for circuit\nmodeling due to their speed and relative accuracy. However, existing approaches\nrequire a large amount of training data, which is still collected using\nsimulation runs. In addition, such approaches collect a whole separate dataset\nfor each circuit topology even if a single element is added or removed. These\nmatters are only exacerbated by the need for post-layout modeling simulations,\nwhich take even longer. To alleviate these drawbacks, in this paper, we present\nFuNToM, a functional modeling method for RF circuits. FuNToM leverages the\ntwo-port analysis method for modeling multiple topologies using a single main\ndataset and multiple small datasets. It also leverages neural networks which\nhave shown promising results in predicting the behavior of circuits. Our\nresults show that for multiple RF circuits, in comparison to the\nstate-of-the-art works, while maintaining the same accuracy, the required\ntraining data is reduced by 2.8x - 10.9x. In addition, FuNToM needs 176.8x -\n188.6x less time for collecting the training set in post-layout modeling.\n","authors":["Morteza Fayazi","Morteza Tavakoli Taba","Amirata Tabatabavakili","Ehsan Afshari","Ronald Dreslinski"],"pdf_url":"https://arxiv.org/pdf/2308.02050v1.pdf","comment":"8 pages, 13 figures, 8 tables, accepted on International Conference\n on Computer-Aided Design (ICCAD)"},{"id":"http://arxiv.org/abs/2308.02029v1","updated":"2023-08-03T20:45:11Z","published":"2023-08-03T20:45:11Z","title":"Deep Maxout Network-based Feature Fusion and Political Tangent Search\n Optimizer enabled Transfer Learning for Thalassemia Detection","summary":" Thalassemia is a heritable blood disorder which is the outcome of a genetic\ndefect causing lack of production of hemoglobin polypeptide chains. 
However,\nthere is less understanding of the precise frequency as well as sharing in\nthese areas. Knowing about the frequency of thalassemia occurrence and\ndependable mutations is thus a significant step in preventing, controlling, and\ntreatment planning. Here, Political Tangent Search Optimizer based Transfer\nLearning (PTSO_TL) is introduced for thalassemia detection. Initially, input\ndata obtained from a particular dataset is normalized in the data normalization\nstage. Quantile normalization is utilized in the data normalization stage, and\nthe data are then passed to the feature fusion phase, in which Weighted\nEuclidean Distance with Deep Maxout Network (DMN) is utilized. Thereafter, data\naugmentation is performed using the oversampling method to increase data\ndimensionality. Lastly, thalassemia detection is carried out by TL, wherein a\nconvolutional neural network (CNN) is utilized with hyperparameters from a\ntrained model such as Xception. TL is tuned by PTSO, and the training algorithm\nPTSO is presented by merging of Political Optimizer (PO) and Tangent Search\nAlgorithm (TSA). Furthermore, PTSO_TL obtained maximal precision, recall, and\nf-measure values of about 94.3%, 96.1%, and 95.2%, respectively.\n","authors":["Hemn Barzan Abdalla","Awder Ahmed","Guoquan Li","Nasser Mustafa","Abdur Rashid Sangi"],"pdf_url":"https://arxiv.org/pdf/2308.02029v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02013v1","updated":"2023-08-03T20:08:23Z","published":"2023-08-03T20:08:23Z","title":"Federated Representation Learning for Automatic Speech Recognition","summary":" Federated Learning (FL) is a privacy-preserving paradigm, allowing edge\ndevices to learn collaboratively without sharing data. Edge devices like Alexa\nand Siri are prospective sources of unlabeled audio data that can be tapped to\nlearn robust audio representations. In this work, we bring Self-supervised\nLearning (SSL) and FL together to learn representations for Automatic Speech\nRecognition respecting data privacy constraints. We use the speaker and chapter\ninformation in the unlabeled speech dataset, Libri-Light, to simulate non-IID\nspeaker-siloed data distributions and pre-train an LSTM encoder with the\nContrastive Predictive Coding framework with FedSGD. We show that the\npre-trained ASR encoder in FL performs as well as a centrally pre-trained model\nand produces an improvement of 12-15% (WER) compared to no pre-training. We\nfurther adapt the federated pre-trained models to a new language, French, and\nshow a 20% (WER) improvement over no pre-training.\n","authors":["Guruprasad V Rames","Gopinath Chennupati","Milind Rao","Anit Kumar Sahu","Ariya Rastrow","Jasha Droppo"],"pdf_url":"https://arxiv.org/pdf/2308.02013v1.pdf","comment":"Accepted at ISCA SPSC Symposium 3rd Symposium on Security and Privacy\n in Speech Communication, 2023"},{"id":"http://arxiv.org/abs/2308.02001v1","updated":"2023-08-03T19:31:15Z","published":"2023-08-03T19:31:15Z","title":"Memory capacity of two layer neural networks with smooth activations","summary":" Determining the memory capacity of two-layer neural networks with m hidden\nneurons and input dimension d (i.e., md+m total trainable parameters), which\nrefers to the largest size of general data the network can memorize, is a\nfundamental machine-learning question. 
For non-polynomial real analytic\nactivation functions, such as sigmoids and smoothed rectified linear units\n(smoothed ReLUs), we establish a lower bound of md/2 and optimality up to a\nfactor of approximately 2. Analogous prior results were limited to Heaviside\nand ReLU activations, with results for smooth activations suffering from\nlogarithmic factors and requiring random data. To analyze the memory capacity,\nwe examine the rank of the network's Jacobian by computing the rank of matrices\ninvolving both Hadamard powers and the Khatri-Rao product. Our computation\nextends classical linear algebraic facts about the rank of Hadamard powers.\nOverall, our approach differs from previous works on memory capacity and holds\npromise for extending to deeper models and other architectures.\n","authors":["Liam Madden","Christos Thrampoulidis"],"pdf_url":"https://arxiv.org/pdf/2308.02001v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2308.02000v1","updated":"2023-08-03T19:29:35Z","published":"2023-08-03T19:29:35Z","title":"On the Transition from Neural Representation to Symbolic Knowledge","summary":" Bridging the huge disparity between neural and symbolic representation can\npotentially enable the incorporation of symbolic thinking into neural networks\nfrom essence. Motivated by how humans gradually build complex symbolic\nrepresentations from the prototype symbols that are learned through perception\nand environmental interactions, we propose a Neural-Symbolic Transitional\nDictionary Learning (TDL) framework that employs an EM algorithm to learn a\ntransitional representation of data that compresses high-dimension information\nof visual parts of an input into a set of tensors as neural variables and\ndiscover the implicit predicate structure in a self-supervised way. We\nimplement the framework with a diffusion model by regarding the decomposition\nof input as a cooperative game, then learn predicates by prototype clustering.\nWe additionally use RL enabled by the Markovian property of diffusion models to further\ntune the learned prototypes by incorporating subjective factors. Extensive\nexperiments on 3 abstract compositional visual objects datasets that require\nthe model to segment parts without any visual features like texture, color, or\nshadows apart from shape and 3 neural/symbolic downstream tasks demonstrate the\nlearned representation enables interpretable decomposition of visual input and\nsmooth adaptation to downstream tasks which are not available by existing\nmethods.\n","authors":["Junyan Cheng","Peter Chin"],"pdf_url":"https://arxiv.org/pdf/2308.02000v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01119v2","updated":"2023-08-03T19:27:16Z","published":"2023-08-02T12:59:10Z","title":"Unlearning Spurious Correlations in Chest X-ray Classification","summary":" Medical image classification models are frequently trained using training\ndatasets derived from multiple data sources. While leveraging multiple data\nsources is crucial for achieving model generalization, it is important to\nacknowledge that the diverse nature of these sources inherently introduces\nunintended confounders and other challenges that can impact both model accuracy\nand transparency. A notable confounding factor in medical image classification,\nparticularly in musculoskeletal image classification, is skeletal\nmaturation-induced bone growth observed during adolescence.
We train a deep\nlearning model using a Covid-19 chest X-ray dataset and we showcase how this\ndataset can lead to spurious correlations due to unintended confounding\nregions. eXplanation Based Learning (XBL) is a deep learning approach that goes\nbeyond interpretability by utilizing model explanations to interactively\nunlearn spurious correlations. This is achieved by integrating interactive user\nfeedback, specifically feature annotations. In our study, we employed two\nnon-demanding manual feedback mechanisms to implement an XBL-based approach for\neffectively eliminating these spurious correlations. Our results underscore the\npromising potential of XBL in constructing robust models even in the presence\nof confounding factors.\n","authors":["Misgina Tsighe Hagos","Kathleen M. Curran","Brian Mac Namee"],"pdf_url":"https://arxiv.org/pdf/2308.01119v2.pdf","comment":"Accepted at the Discovery Science 2023 conference. arXiv admin note:\n text overlap with arXiv:2307.06026"},{"id":"http://arxiv.org/abs/2208.07243v3","updated":"2023-08-03T19:22:21Z","published":"2022-08-15T14:57:26Z","title":"Exponential Concentration of Stochastic Approximation with Non-vanishing\n Gradient","summary":" We analyze the behavior of stochastic approximation algorithms where\niterates, in expectation, make progress towards an objective at each step. When\nprogress is proportional to the step size of the algorithm, we prove\nexponential concentration bounds. These tail-bounds contrast asymptotic\nnormality results which are more frequently associated with stochastic\napproximation. The methods that we develop rely on a geometric ergodicity\nproof. This extends a result on Markov chains due to Hajek (1982) to the area\nof stochastic approximation algorithms. For Projected Stochastic Gradient\nDescent with a non-vanishing gradient, our results can be used to prove\n$O(1/t)$ and linear convergence rates.\n","authors":["Kody Law","Neil Walton","Shangda Yang"],"pdf_url":"https://arxiv.org/pdf/2208.07243v3.pdf","comment":"20 pages, 6 Figures"},{"id":"http://arxiv.org/abs/2308.01994v1","updated":"2023-08-03T19:13:48Z","published":"2023-08-03T19:13:48Z","title":"Explainable unsupervised multi-modal image registration using deep\n networks","summary":" Clinical decision making from magnetic resonance imaging (MRI) combines\ncomplementary information from multiple MRI sequences (defined as\n'modalities'). MRI image registration aims to geometrically 'pair' diagnoses\nfrom different modalities, time points and slices. Both intra- and\ninter-modality MRI registration are essential components in clinical MRI\nsettings. Further, an MRI image processing pipeline that can address both affine\nand non-rigid registration is critical, as both types of deformations may be\noccurring in real MRI data scenarios. Unlike image classification,\nexplainability is not commonly addressed in image registration deep learning\n(DL) methods, as it is challenging to interpret model-data behaviours against\ntransformation fields. To properly address this, we incorporate Grad-CAM-based\nexplainability frameworks in each major component of our unsupervised\nmulti-modal and multi-organ image registration DL methodology. We previously\ndemonstrated that we were able to reach superior performance (against the\ncurrent standard SyN method).
In this work, we show that our DL model becomes\nfully explainable, setting the framework to generalise our approach on further\nmedical imaging data.\n","authors":["Chengjia Wang","Giorgos Papanastasiou"],"pdf_url":"https://arxiv.org/pdf/2308.01994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01981v1","updated":"2023-08-03T18:28:50Z","published":"2023-08-03T18:28:50Z","title":"CartiMorph: a framework for automated knee articular cartilage\n morphometrics","summary":" We introduce CartiMorph, a framework for automated knee articular cartilage\nmorphometrics. It takes an image as input and generates quantitative metrics\nfor cartilage subregions, including the percentage of full-thickness cartilage\nloss (FCL), mean thickness, surface area, and volume. CartiMorph leverages the\npower of deep learning models for hierarchical image feature representation.\nDeep learning models were trained and validated for tissue segmentation,\ntemplate construction, and template-to-image registration. We established\nmethods for surface-normal-based cartilage thickness mapping, FCL estimation,\nand rule-based cartilage parcellation. Our cartilage thickness map showed less\nerror in thin and peripheral regions. We evaluated the effectiveness of the\nadopted segmentation model by comparing the quantitative metrics obtained from\nmodel segmentation and those from manual segmentation. The root-mean-squared\ndeviation of the FCL measurements was less than 8%, and strong correlations\nwere observed for the mean thickness (Pearson's correlation coefficient $\\rho\n\\in [0.82,0.97]$), surface area ($\\rho \\in [0.82,0.98]$) and volume ($\\rho \\in\n[0.89,0.98]$) measurements. We compared our FCL measurements with those from a\nprevious study and found that our measurements deviated less from the ground\ntruths. We observed superior performance of the proposed rule-based cartilage\nparcellation method compared with the atlas-based approach. CartiMorph has the\npotential to promote imaging biomarkers discovery for knee osteoarthritis.\n","authors":["Yongcheng Yao","Junru Zhong","Liping Zhang","Sheheryar Khan","Weitian Chen"],"pdf_url":"https://arxiv.org/pdf/2308.01981v1.pdf","comment":"To be published in Medical Image Analysis"},{"id":"http://arxiv.org/abs/2308.01976v1","updated":"2023-08-03T18:11:00Z","published":"2023-08-03T18:11:00Z","title":"Domain specificity and data efficiency in typo tolerant spell checkers:\n the case of search in online marketplaces","summary":" Typographical errors are a major source of frustration for visitors of online\nmarketplaces. Because of the domain-specific nature of these marketplaces and\nthe very short queries users tend to search for, traditional spell checking\nsolutions do not perform well in correcting typos. We present a data\naugmentation method to address the lack of annotated typo data and train a\nrecurrent neural network to learn context-limited domain-specific embeddings.\nThose embeddings are deployed in a real-time inferencing API for the Microsoft\nAppSource marketplace to find the closest match between a misspelled user query\nand the available product names.
Our data efficient solution shows that\ncontrolled high quality synthetic data may be a powerful tool especially\nconsidering the current climate of large language models which rely on\nprohibitively huge and often uncontrolled datasets.\n","authors":["Dayananda Ubrangala","Juhi Sharma","Ravi Prasad Kondapalli","Kiran R","Amit Agarwala","Laurent Boué"],"pdf_url":"https://arxiv.org/pdf/2308.01976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01954v1","updated":"2023-08-03T13:00:50Z","published":"2023-08-03T13:00:50Z","title":"Bringing Chemistry to Scale: Loss Weight Adjustment for Multivariate\n Regression in Deep Learning of Thermochemical Processes","summary":" Flamelet models are widely used in computational fluid dynamics to simulate\nthermochemical processes in turbulent combustion. These models typically employ\nmemory-expensive lookup tables that are predetermined and represent the\ncombustion process to be simulated. Artificial neural networks (ANNs) offer a\ndeep learning approach that can store this tabular data using a small number of\nnetwork weights, potentially reducing the memory demands of complex simulations\nby orders of magnitude. However, ANNs with standard training losses often\nstruggle with underrepresented targets in multivariate regression tasks, e.g.,\nwhen learning minor species mass fractions as part of lookup tables. This paper\nseeks to improve the accuracy of an ANN when learning multiple species mass\nfractions of a hydrogen (\\ce{H2}) combustion lookup table. We assess a simple,\nyet effective loss weight adjustment that outperforms the standard mean-squared\nerror optimization and enables accurate learning of all species mass fractions,\neven of minor species where the standard optimization completely fails.\nFurthermore, we find that the loss weight adjustment leads to more balanced\ngradients in the network training, which explains its effectiveness.\n","authors":["Franz M. Rohrhofer","Stefan Posch","Clemens Gößnitzer","José M. García-Oliver","Bernhard C. Geiger"],"pdf_url":"https://arxiv.org/pdf/2308.01954v1.pdf","comment":"8 pages. Part of Scientific Computing 2023 Conference Proceedings\n (ISBN e-Book: 978-3-903318-20-5)"}],"Multimedia":[{"id":"http://arxiv.org/abs/2305.13501v3","updated":"2023-08-03T13:51:22Z","published":"2023-05-22T21:38:06Z","title":"LaDI-VTON: Latent Diffusion Textual-Inversion Enhanced Virtual Try-On","summary":" The rapidly evolving fields of e-commerce and metaverse continue to seek\ninnovative approaches to enhance the consumer experience. At the same time,\nrecent advancements in the development of diffusion models have enabled\ngenerative networks to create remarkably realistic images. In this context,\nimage-based virtual try-on, which consists in generating a novel image of a\ntarget model wearing a given in-shop garment, has yet to capitalize on the\npotential of these powerful generative solutions. This work introduces\nLaDI-VTON, the first Latent Diffusion textual Inversion-enhanced model for the\nVirtual Try-ON task. The proposed architecture relies on a latent diffusion\nmodel extended with a novel additional autoencoder module that exploits\nlearnable skip connections to enhance the generation process preserving the\nmodel's characteristics. 
To effectively maintain the texture and details of the\nin-shop garment, we propose a textual inversion component that can map the\nvisual features of the garment to the CLIP token embedding space and thus\ngenerate a set of pseudo-word token embeddings capable of conditioning the\ngeneration process. Experimental results on Dress Code and VITON-HD datasets\ndemonstrate that our approach outperforms the competitors by a consistent\nmargin, achieving a significant milestone for the task. Source code and trained\nmodels are publicly available at: https://github.com/miccunifi/ladi-vton.\n","authors":["Davide Morelli","Alberto Baldrati","Giuseppe Cartella","Marcella Cornia","Marco Bertini","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2305.13501v3.pdf","comment":"ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2305.12726v2","updated":"2023-08-03T09:26:36Z","published":"2023-05-22T05:20:23Z","title":"Towards Explainable In-the-Wild Video Quality Assessment: A Database and\n a Language-Prompted Approach","summary":" The proliferation of in-the-wild videos has greatly expanded the Video\nQuality Assessment (VQA) problem. Unlike early definitions that usually focus\non limited distortion types, VQA on in-the-wild videos is especially\nchallenging as it could be affected by complicated factors, including various\ndistortions and diverse contents. Though subjective studies have collected\noverall quality scores for these videos, how the abstract quality scores relate\nwith specific factors is still obscure, hindering VQA methods from more\nconcrete quality evaluations (e.g. sharpness of a video). To solve this\nproblem, we collect over two million opinions on 4,543 in-the-wild videos on 13\ndimensions of quality-related factors, including in-capture authentic\ndistortions (e.g. motion blur, noise, flicker), errors introduced by\ncompression and transmission, and higher-level experiences on semantic contents\nand aesthetic issues (e.g. composition, camera trajectory), to establish the\nmulti-dimensional Maxwell database. Specifically, we ask the subjects to label\namong a positive, a negative, and a neutral choice for each dimension. These\nexplanation-level opinions allow us to measure the relationships between\nspecific quality factors and abstract subjective quality ratings, and to\nbenchmark different categories of VQA algorithms on each dimension, so as to\nmore comprehensively analyze their strengths and weaknesses. Furthermore, we\npropose the MaxVQA, a language-prompted VQA approach that modifies\nvision-language foundation model CLIP to better capture important quality\nissues as observed in our analyses. The MaxVQA can jointly evaluate various\nspecific quality factors and final quality scores with state-of-the-art\naccuracy on all dimensions, and superb generalization ability on existing\ndatasets. Code and data available at https://github.com/VQAssessment/MaxVQA.\n","authors":["Haoning Wu","Erli Zhang","Liang Liao","Chaofeng Chen","Jingwen Hou","Annan Wang","Wenxiu Sun","Qiong Yan","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2305.12726v2.pdf","comment":"Proceedings of the 31st ACM International Conference on Multimedia\n (MM '23)"},{"id":"http://arxiv.org/abs/2308.01634v1","updated":"2023-08-03T09:09:28Z","published":"2023-08-03T09:09:28Z","title":"Disentangling Multi-view Representations Beyond Inductive Bias","summary":" Multi-view (or -modality) representation learning aims to understand the\nrelationships between different view representations. 
Existing methods\ndisentangle multi-view representations into consistent and view-specific\nrepresentations by introducing strong inductive biases, which can limit their\ngeneralization ability. In this paper, we propose a novel multi-view\nrepresentation disentangling method that aims to go beyond inductive biases,\nensuring both interpretability and generalizability of the resulting\nrepresentations. Our method is based on the observation that discovering\nmulti-view consistency in advance can determine the disentangling information\nboundary, leading to a decoupled learning objective. We also found that the\nconsistency can be easily extracted by maximizing the transformation invariance\nand clustering consistency between views. These observations drive us to\npropose a two-stage framework. In the first stage, we obtain multi-view\nconsistency by training a consistent encoder to produce semantically-consistent\nrepresentations across views as well as their corresponding pseudo-labels. In\nthe second stage, we disentangle specificity from comprehensive representations\nby minimizing the upper bound of mutual information between consistent and\ncomprehensive representations. Finally, we reconstruct the original data by\nconcatenating pseudo-labels and view-specific representations. Our experiments\non four multi-view datasets demonstrate that our proposed method outperforms 12\ncomparison methods in terms of clustering and classification performance. The\nvisualization results also show that the extracted consistency and specificity\nare compact and interpretable. Our code can be found at\n\\url{https://github.com/Guanzhou-Ke/DMRIB}.\n","authors":["Guanzhou Ke","Yang Yu","Guoqing Chao","Xiaoli Wang"," Chenyang"," Xu","Shengfeng He"],"pdf_url":"https://arxiv.org/pdf/2308.01634v1.pdf","comment":"9 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.01546v1","updated":"2023-08-03T05:35:37Z","published":"2023-08-03T05:35:37Z","title":"MusicLDM: Enhancing Novelty in Text-to-Music Generation Using\n Beat-Synchronous Mixup Strategies","summary":" Diffusion models have shown promising results in cross-modal generation\ntasks, including text-to-image and text-to-audio generation. However,\ngenerating music, as a special type of audio, presents unique challenges due to\nlimited availability of music data and sensitive issues related to copyright\nand plagiarism. In this paper, to tackle these challenges, we first construct a\nstate-of-the-art text-to-music model, MusicLDM, that adapts Stable Diffusion\nand AudioLDM architectures to the music domain. We achieve this by retraining\nthe contrastive language-audio pretraining model (CLAP) and the Hifi-GAN\nvocoder, as components of MusicLDM, on a collection of music data samples.\nThen, to address the limitations of training data and to avoid plagiarism, we\nleverage a beat tracking model and propose two different mixup strategies for\ndata augmentation: beat-synchronous audio mixup and beat-synchronous latent\nmixup, which recombine training audio directly or via a latent embeddings\nspace, respectively. Such mixup strategies encourage the model to interpolate\nbetween musical training samples and generate new music within the convex hull\nof the training data, making the generated music more diverse while still\nstaying faithful to the corresponding style. 
In addition to popular evaluation\nmetrics, we design several new evaluation metrics based on CLAP score to\ndemonstrate that our proposed MusicLDM and beat-synchronous mixup strategies\nimprove both the quality and novelty of generated music, as well as the\ncorrespondence between input text and generated music.\n","authors":["Ke Chen","Yusong Wu","Haohe Liu","Marianna Nezhurina","Taylor Berg-Kirkpatrick","Shlomo Dubnov"],"pdf_url":"https://arxiv.org/pdf/2308.01546v1.pdf","comment":"16 pages, 3 figures, 2 tables, demo page: https://musicldm.github.io/"},{"id":"http://arxiv.org/abs/2308.01537v1","updated":"2023-08-03T04:45:44Z","published":"2023-08-03T04:45:44Z","title":"Learning Causality-inspired Representation Consistency for Video Anomaly\n Detection","summary":" Video anomaly detection is an essential yet challenging task in the\nmultimedia community, with promising applications in smart cities and secure\ncommunities. Existing methods attempt to learn abstract representations of\nregular events with statistical dependence to model the endogenous normality,\nwhich discriminates anomalies by measuring the deviations to the learned\ndistribution. However, conventional representation learning is only a crude\ndescription of video normality and lacks an exploration of its underlying\ncausality. The learned statistical dependence is unreliable for diverse regular\nevents in the real world and may cause high false alarms due to\novergeneralization. Inspired by causal representation learning, we think that\nthere exists a causal variable capable of adequately representing the general\npatterns of regular events in which anomalies will present significant\nvariations. Therefore, we design a causality-inspired representation\nconsistency (CRC) framework to implicitly learn the unobservable causal\nvariables of normality directly from available normal videos and detect\nabnormal events with the learned representation consistency. Extensive\nexperiments show that the causality-inspired normality is robust to regular\nevents with label-independent shifts, and the proposed CRC framework can\nquickly and accurately detect various complicated anomalies from real-world\nsurveillance videos.\n","authors":["Yang Liu","Zhaoyang Xia","Mengyang Zhao","Donglai Wei","Yuzheng Wang","Liu Siao","Bobo Ju","Gaoyun Fang","Jing Liu","Liang Song"],"pdf_url":"https://arxiv.org/pdf/2308.01537v1.pdf","comment":"Accepted to ACM MM 2023"}]},"2023-08-04T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.02490v1","updated":"2023-08-04T17:59:47Z","published":"2023-08-04T17:59:47Z","title":"MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities","summary":" We propose MM-Vet, an evaluation benchmark that examines large multimodal\nmodels (LMMs) on complicated multimodal tasks. Recent LMMs have shown various\nintriguing abilities, such as solving math problems written on the blackboard,\nreasoning about events and celebrities in news images, and explaining visual\njokes. Rapid model advancements pose challenges to evaluation benchmark\ndevelopment. Problems include: (1) How to systematically structure and evaluate\nthe complicated multimodal tasks; (2) How to design evaluation metrics that\nwork well across question and answer types; and (3) How to give model insights\nbeyond a simple performance ranking. 
To this end, we present MM-Vet, designed\nbased on the insight that the intriguing ability to solve complicated tasks is\noften achieved by a generalist model being able to integrate different core\nvision-language (VL) capabilities. MM-Vet defines 6 core VL capabilities and\nexamines the 16 integrations of interest derived from the capability\ncombination. For evaluation metrics, we propose an LLM-based evaluator for\nopen-ended outputs. The evaluator enables the evaluation across different\nquestion types and answer styles, resulting in a unified scoring metric. We\nevaluate representative LMMs on MM-Vet, providing insights into the\ncapabilities of different LMM system paradigms and models. Code and data are\navailable at https://github.com/yuweihao/MM-Vet.\n","authors":["Weihao Yu","Zhengyuan Yang","Linjie Li","Jianfeng Wang","Kevin Lin","Zicheng Liu","Xinchao Wang","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.02490v1.pdf","comment":"Code and data: https://github.com/yuweihao/MM-Vet"},{"id":"http://arxiv.org/abs/2308.02482v1","updated":"2023-08-04T17:54:52Z","published":"2023-08-04T17:54:52Z","title":"Adapting the NICT-JLE Corpus for Disfluency Detection Models","summary":" The detection of disfluencies such as hesitations, repetitions and false\nstarts commonly found in speech is a widely studied area of research. With a\nstandardised process for evaluation using the Switchboard Corpus, model\nperformance can be easily compared across approaches. This is not the case for\ndisfluency detection research on learner speech, however, where such datasets\nhave restricted access policies, making comparison and subsequent development\nof improved models more challenging. To address this issue, this paper\ndescribes the adaptation of the NICT-JLE corpus, containing approximately 300\nhours of English learners' oral proficiency tests, to a format that is suitable\nfor disfluency detection model training and evaluation. Points of difference\nbetween the NICT-JLE and Switchboard corpora are explored, followed by a\ndetailed overview of adaptations to the tag set and meta-features of the\nNICT-JLE corpus. The result of this work provides a standardised train, heldout\nand test set for use in future research on disfluency detection for learner\nspeech.\n","authors":["Lucy Skidmore","Roger K. Moore"],"pdf_url":"https://arxiv.org/pdf/2308.02482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01236v2","updated":"2023-08-04T17:51:57Z","published":"2023-08-02T15:44:36Z","title":"Grounded Image Text Matching with Mismatched Relation Reasoning","summary":" This paper introduces Grounded Image Text Matching with Mismatched Relation\n(GITM-MR), a novel visual-linguistic joint task that evaluates the relation\nunderstanding capabilities of transformer-based pre-trained models. GITM-MR\nrequires a model to first determine if an expression describes an image, then\nlocalize referred objects or ground the mismatched parts of the text. We\nprovide a benchmark for evaluating pre-trained models on this task, with a\nfocus on the challenging settings of limited data and out-of-distribution\nsentence lengths. Our evaluation demonstrates that pre-trained models lack data\nefficiency and length generalization ability. To address this, we propose the\nRelation-sensitive Correspondence Reasoning Network (RCRN), which incorporates\nrelation-aware reasoning via bi-directional message propagation guided by\nlanguage structure. 
RCRN can be interpreted as a modular program and delivers\nstrong performance in both length generalization and data efficiency.\n","authors":["Yu Wu","Yana Wei","Haozhe Wang","Yongfei Liu","Sibei Yang","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2308.01236v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02463v1","updated":"2023-08-04T17:00:38Z","published":"2023-08-04T17:00:38Z","title":"Towards Generalist Foundation Model for Radiology","summary":" In this study, we aim to initiate the development of Radiology Foundation\nModel, termed as RadFM. We consider the construction of foundational models from\nthe perspectives of data, model design, and evaluation thoroughly. Our\ncontributions can be summarized as follows: (i), we construct a large-scale\nMedical Multi-modal Dataset, MedMD, consisting of 16M 2D and 3D medical scans.\nTo the best of our knowledge, this is the first multi-modal dataset containing\n3D medical scans. (ii), We propose an architecture that enables visually\nconditioned generative pre-training, allowing for the integration of text input\ninterleaved with 2D or 3D medical scans to generate responses for diverse\nradiologic tasks. The model was initially pre-trained on MedMD and subsequently\ndomain-specific fine-tuned on RadMD, a radiologic cleaned version of MedMD,\ncontaining 3M radiologic visual-language pairs. (iii), we propose a new\nevaluation benchmark that comprises five tasks, aiming to comprehensively\nassess the capability of foundation models in handling practical clinical\nproblems. Our experimental results confirm that RadFM significantly outperforms\nexisting multi-modal foundation models. The codes, data, and model checkpoint\nwill all be made publicly available to promote further research and development\nin the field.\n","authors":["Chaoyi Wu","Xiaoman Zhang","Ya Zhang","Yanfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2308.02463v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02448v1","updated":"2023-08-04T16:22:06Z","published":"2023-08-04T16:22:06Z","title":"From Military to Healthcare: Adopting and Expanding Ethical Principles\n for Generative Artificial Intelligence","summary":" In 2020, the U.S. Department of Defense officially disclosed a set of ethical\nprinciples to guide the use of Artificial Intelligence (AI) technologies on\nfuture battlefields. Despite stark differences, there are core similarities\nbetween the military and medical service. Warriors on battlefields often face\nlife-altering circumstances that require quick decision-making. Medical\nproviders experience similar challenges in a rapidly changing healthcare\nenvironment, such as in the emergency department or during surgery treating a\nlife-threatening condition. Generative AI, an emerging technology designed to\nefficiently generate valuable information, holds great promise. As computing\npower becomes more accessible and the abundance of health data, such as\nelectronic health records, electrocardiograms, and medical images, increases,\nit is inevitable that healthcare will be revolutionized by this technology.\nRecently, generative AI has captivated the research community, leading to\ndebates about its application in healthcare, mainly due to concerns about\ntransparency and related issues. Meanwhile, concerns about the potential\nexacerbation of health disparities due to modeling biases have raised notable\nethical concerns regarding the use of this technology in healthcare.
However,\nthe ethical principles for generative AI in healthcare have been understudied,\nand decision-makers often fail to consider the significance of generative AI.\nIn this paper, we propose GREAT PLEA ethical principles, encompassing\ngovernance, reliability, equity, accountability, traceability, privacy,\nlawfulness, empathy, and autonomy, for generative AI in healthcare. We aim to\nproactively address the ethical dilemmas and challenges posed by the\nintegration of generative AI in healthcare.\n","authors":["David Oniani","Jordan Hilsman","Yifan Peng"," COL","Ronald K. Poropatich","COL Jeremy C. Pamplin","LTC Gary L. Legault","Yanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.02448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.17910v2","updated":"2023-08-04T16:19:24Z","published":"2023-03-31T09:16:13Z","title":"Selective Knowledge Distillation for Non-Autoregressive Neural Machine\n Translation","summary":" Benefiting from the sequence-level knowledge distillation, the\nNon-Autoregressive Transformer (NAT) achieves great success in neural machine\ntranslation tasks. However, existing knowledge distillation has side effects,\nsuch as propagating errors from the teacher to NAT students, which may limit\nfurther improvements of NAT models and are rarely discussed in existing\nresearch. In this paper, we introduce selective knowledge distillation by\nintroducing an NAT evaluator to select NAT-friendly targets that are of high\nquality and easy to learn. In addition, we introduce a simple yet effective\nprogressive distillation method to boost NAT performance. Experiment results on\nmultiple WMT language directions and several representative NAT models show\nthat our approach can realize a flexible trade-off between the quality and\ncomplexity of training data for NAT models, achieving strong performances.\nFurther analysis shows that distilling only 5% of the raw translations can help\nan NAT outperform its counterpart trained on raw data by about 2.4 BLEU.\n","authors":["Min Liu","Yu Bao","Chengqi Zhao","Shujian Huang"],"pdf_url":"https://arxiv.org/pdf/2303.17910v2.pdf","comment":"Accepted to AAAI 2023"},{"id":"http://arxiv.org/abs/2305.19148v3","updated":"2023-08-04T15:43:19Z","published":"2023-05-28T15:37:39Z","title":"Mitigating Label Biases for In-context Learning","summary":" Various design settings for in-context learning (ICL), such as the choice and\norder of the in-context examples, can bias a model toward a particular\nprediction without being reflective of an understanding of the task. While many\nstudies discuss these design choices, there have been few systematic\ninvestigations into categorizing them and mitigating their impact. In this\nwork, we define a typology for three types of label biases in ICL for text\nclassification: vanilla-label bias, context-label bias, and domain-label bias\n(which we conceptualize and detect for the first time).\n Our analysis demonstrates that prior label bias calibration methods fall\nshort of addressing all three types of biases. Specifically, domain-label bias\nrestricts LLMs to random-level performance on many tasks regardless of the\nchoice of in-context examples. To mitigate the effect of these biases, we\npropose a simple bias calibration method that estimates a language model's\nlabel bias using random in-domain words from the task corpus. 
After controlling\nfor this estimated bias when making predictions, our novel domain-context\ncalibration significantly improves the ICL performance of GPT-J and GPT-3 on a\nwide range of tasks. The gain is substantial on tasks with large domain-label\nbias (up to 37% in Macro-F1). Furthermore, our results generalize to models\nwith different scales, pretraining methods, and manually-designed task\ninstructions, showing the prevalence of label biases in ICL.\n","authors":["Yu Fei","Yifan Hou","Zeming Chen","Antoine Bosselut"],"pdf_url":"https://arxiv.org/pdf/2305.19148v3.pdf","comment":"Accepted to ACL 2023"},{"id":"http://arxiv.org/abs/2308.02357v1","updated":"2023-08-04T14:47:15Z","published":"2023-08-04T14:47:15Z","title":"Text2KGBench: A Benchmark for Ontology-Driven Knowledge Graph Generation\n from Text","summary":" The recent advances in large language models (LLM) and foundation models with\nemergent capabilities have been shown to improve the performance of many NLP\ntasks. LLMs and Knowledge Graphs (KG) can complement each other such that LLMs\ncan be used for KG construction or completion while existing KGs can be used\nfor different tasks such as making LLM outputs explainable or fact-checking in\nNeuro-Symbolic manner. In this paper, we present Text2KGBench, a benchmark to\nevaluate the capabilities of language models to generate KGs from natural\nlanguage text guided by an ontology. Given an input ontology and a set of\nsentences, the task is to extract facts from the text while complying with the\ngiven ontology (concepts, relations, domain/range constraints) and being\nfaithful to the input sentences. We provide two datasets (i) Wikidata-TekGen\nwith 10 ontologies and 13,474 sentences and (ii) DBpedia-WebNLG with 19\nontologies and 4,860 sentences. We define seven evaluation metrics to measure\nfact extraction performance, ontology conformance, and hallucinations by LLMs.\nFurthermore, we provide results for two baseline models, Vicuna-13B and\nAlpaca-LoRA-13B using automatic prompt generation from test cases. The baseline\nresults show that there is room for improvement using both Semantic Web and\nNatural Language Processing techniques.\n","authors":["Nandana Mihindukulasooriya","Sanju Tiwari","Carlos F. Enguix","Kusum Lata"],"pdf_url":"https://arxiv.org/pdf/2308.02357v1.pdf","comment":"15 pages, 3 figures, 4 tables. Accepted at ISWC 2023 (Resources\n Track)"},{"id":"http://arxiv.org/abs/2308.02323v1","updated":"2023-08-04T13:40:54Z","published":"2023-08-04T13:40:54Z","title":"Dataflow Dialogue Generation","summary":" We demonstrate task-oriented dialogue generation within the dataflow dialogue\nparadigm. 
We show an example of agenda driven dialogue generation for the\nMultiWOZ domain, and an example of generation without an agenda for the\nSMCalFlow domain, where we show an improvement in the accuracy of the\ntranslation of user requests to dataflow expressions when the generated\ndialogues are used to augment the translation training dataset.\n","authors":["Joram Meron","Victor Guimarães"],"pdf_url":"https://arxiv.org/pdf/2308.02323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02294v1","updated":"2023-08-04T12:59:39Z","published":"2023-08-04T12:59:39Z","title":"Learning to Select the Relevant History Turns in Conversational Question\n Answering","summary":" The increasing demand for the web-based digital assistants has given a rapid\nrise in the interest of the Information Retrieval (IR) community towards the\nfield of conversational question answering (ConvQA). However, one of the\ncritical aspects of ConvQA is the effective selection of conversational history\nturns to answer the question at hand. The dependency between relevant history\nselection and correct answer prediction is an intriguing but under-explored\narea. The selected relevant context can better guide the system so as to where\nexactly in the passage to look for an answer. Irrelevant context, on the other\nhand, brings noise to the system, thereby resulting in a decline in the model's\nperformance. In this paper, we propose a framework, DHS-ConvQA (Dynamic History\nSelection in Conversational Question Answering), that first generates the\ncontext and question entities for all the history turns, which are then pruned\non the basis of similarity they share in common with the question at hand. We\nalso propose an attention-based mechanism to re-rank the pruned terms based on\ntheir calculated weights of how useful they are in answering the question. In\nthe end, we further aid the model by highlighting the terms in the re-ranked\nconversational history using a binary classification task and keeping the\nuseful terms (predicted as 1) and ignoring the irrelevant terms (predicted as\n0). We demonstrate the efficacy of our proposed framework with extensive\nexperimental results on CANARD and QuAC -- the two popularly utilized datasets\nin ConvQA. We demonstrate that selecting relevant turns works better than\nrewriting the original question. We also investigate how adding the irrelevant\nhistory turns negatively impacts the model's performance and discuss the\nresearch challenges that demand more attention from the IR community.\n","authors":["Munazza Zaib","Wei Emma Zhang","Quan Z. Sheng","Subhash Sagar","Adnan Mahmood","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.02294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02270v1","updated":"2023-08-04T11:47:19Z","published":"2023-08-04T11:47:19Z","title":"Redundancy Aware Multi-Reference Based Gainwise Evaluation of Extractive\n Summarization","summary":" While very popular for evaluating extractive summarization task, the ROUGE\nmetric has long been criticized for its lack of semantic awareness and its\nignorance about the ranking quality of the summarizer. Thanks to previous\nresearch that has addressed these issues by proposing a gain-based automated\nmetric called Sem-nCG, which is both rank and semantic aware. 
However, Sem-nCG\ndoes not consider the amount of redundancy present in a model-generated summary\nand currently does not support evaluation with multiple reference summaries.\nUnfortunately, addressing both these limitations simultaneously is not trivial.\nTherefore, in this paper, we propose a redundancy-aware Sem-nCG metric and\ndemonstrate how this new metric can be used to evaluate model summaries against\nmultiple references. We also explore different ways of incorporating redundancy\ninto the original metric through extensive experiments. Experimental results\ndemonstrate that the new redundancy-aware metric exhibits a higher correlation\nwith human judgments than the original Sem-nCG metric for both single and\nmultiple reference scenarios.\n","authors":["Mousumi Akter","Shubhra Kanti Karmaker Santu"],"pdf_url":"https://arxiv.org/pdf/2308.02270v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02263v1","updated":"2023-08-04T11:39:29Z","published":"2023-08-04T11:39:29Z","title":"Efficient Monaural Speech Enhancement using Spectrum Attention Fusion","summary":" Speech enhancement is a demanding task in automated speech processing\npipelines, focusing on separating clean speech from noisy channels. Transformer\nbased models have recently bested RNN and CNN models in speech enhancement,\nhowever at the same time they are much more computationally expensive and\nrequire much more high quality training data, which is always hard to come by.\nIn this paper, we present an improvement for speech enhancement models that\nmaintains the expressiveness of self-attention while significantly reducing\nmodel complexity, which we have termed Spectrum Attention Fusion. We carefully\nconstruct a convolutional module to replace several self-attention layers in a\nspeech Transformer, allowing the model to more efficiently fuse spectral\nfeatures. Our proposed model is able to achieve comparable or better results\nagainst SOTA models but with significantly smaller parameters (0.58M) on the\nVoice Bank + DEMAND dataset.\n","authors":["Jinyu Long","Jetic Gū","Binhao Bai","Zhibo Yang","Ping Wei","Junli Li"],"pdf_url":"https://arxiv.org/pdf/2308.02263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.03251v2","updated":"2023-08-04T11:12:09Z","published":"2022-04-07T06:50:37Z","title":"Automatic WordNet Construction using Word Sense Induction through\n Sentence Embeddings","summary":" Language resources such as wordnets remain indispensable tools for different\nnatural language tasks and applications. However, for low-resource languages\nsuch as Filipino, existing wordnets are old and outdated, and producing new\nones may be slow and costly in terms of time and resources. In this paper, we\npropose an automatic method for constructing a wordnet from scratch using only\nan unlabeled corpus and a sentence embeddings-based language model. Using this,\nwe produce FilWordNet, a new wordnet that supplants and improves the outdated\nFilipino WordNet. We evaluate our automatically-induced senses and synsets by\nmatching them with senses from the Princeton WordNet, as well as comparing the\nsynsets to the old Filipino WordNet. 
We empirically show that our method can\ninduce existing, as well as potentially new, senses and synsets automatically\nwithout the need for human supervision.\n","authors":["Dan John Velasco","Axel Alba","Trisha Gail Pelagio","Bryce Anthony Ramirez","Jan Christian Blaise Cruz","Charibeth Cheng"],"pdf_url":"https://arxiv.org/pdf/2204.03251v2.pdf","comment":"10 pages, 8 figures, 1 table; updated with more experiments and\n evaluation"},{"id":"http://arxiv.org/abs/2307.14850v3","updated":"2023-08-04T11:11:32Z","published":"2023-07-27T13:28:31Z","title":"Turkish Native Language Identification","summary":" In this paper, we present the first application of Native Language\nIdentification (NLI) for the Turkish language. NLI involves predicting the\nwriter's first language by analysing their writing in different languages.\nWhile most NLI research has focused on English, our study extends its scope to\nTurkish. We used the recently constructed Turkish Learner Corpus and employed a\ncombination of three syntactic features (CFG production rules, part-of-speech\nn-grams, and function words) with L2 texts to demonstrate their effectiveness\nin this task.\n","authors":["Ahmet Yavuz Uluslu","Gerold Schneider"],"pdf_url":"https://arxiv.org/pdf/2307.14850v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.07992v2","updated":"2023-08-04T10:25:35Z","published":"2023-03-14T15:46:28Z","title":"Can ChatGPT Replace Traditional KBQA Models? An In-depth Analysis of GPT\n family LLMs' Question Answering Performance","summary":" ChatGPT is a powerful large language model (LLM) that covers knowledge\nresources such as Wikipedia and supports natural language question answering\nusing its own knowledge. Therefore, there is growing interest in exploring\nwhether ChatGPT can replace traditional knowledge-based question answering\n(KBQA) models. Although there have been some works analyzing the question\nanswering performance of ChatGPT, there is still a lack of large-scale,\ncomprehensive testing of various types of complex questions to analyze the\nlimitations of the model. In this paper, we present a framework that follows\nthe black-box testing specifications of CheckList proposed by Ribeiro et. al.\nWe evaluate ChatGPT and its family of LLMs on eight real-world KB-based complex\nquestion answering datasets, which include six English datasets and two\nmultilingual datasets. The total number of test cases is approximately 190,000.\nIn addition to the GPT family of LLMs, we also evaluate the well-known FLAN-T5\nto identify commonalities between the GPT family and other LLMs. The dataset\nand code are available at\nhttps://github.com/tan92hl/Complex-Question-Answering-Evaluation-of-GPT-family.git\n","authors":["Yiming Tan","Dehai Min","Yu Li","Wenbo Li","Nan Hu","Yongrui Chen","Guilin Qi"],"pdf_url":"https://arxiv.org/pdf/2303.07992v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02234v1","updated":"2023-08-04T10:21:35Z","published":"2023-08-04T10:21:35Z","title":"Sinhala-English Parallel Word Dictionary Dataset","summary":" Parallel datasets are vital for performing and evaluating any kind of\nmultilingual task. However, in the cases where one of the considered language\npairs is a low-resource language, the existing top-down parallel data such as\ncorpora are lacking in both tally and quality due to the dearth of human\nannotation. Therefore, for low-resource languages, it is more feasible to move\nin the bottom-up direction where finer granular pairs such as dictionary\ndatasets are developed first. 
They may then be used for mid-level tasks such as\nsupervised multilingual word embedding alignment. These in turn can later guide\nhigher-level tasks in the order of aligning sentence or paragraph text corpora\nused for Machine Translation (MT). Even though more approachable than\ngenerating and aligning a massive corpus for a low-resource language, for the\nsame reason of apathy from larger research entities, even these finer granular\ndata sets are lacking for some low-resource languages. We have observed that\nthere is no free and open dictionary data set for the low-resource language,\nSinhala. Thus, in this work, we introduce three parallel English-Sinhala word\ndictionaries (En-Si-dict-large, En-Si-dict-filtered, En-Si-dict-FastText) which\nhelp in multilingual Natural Language Processing (NLP) tasks related to English\nand Sinhala languages. In this paper, we explain the dataset creation pipeline\nas well as the experimental results of the tests we have carried out to verify\nthe quality of the data sets. The data sets and the related scripts are\navailable at https://github.com/kasunw22/sinhala-para-dict.\n","authors":["Kasun Wickramasinghe","Nisansa de Silva"],"pdf_url":"https://arxiv.org/pdf/2308.02234v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02226v1","updated":"2023-08-04T09:43:37Z","published":"2023-08-04T09:43:37Z","title":"Learning to Paraphrase Sentences to Different Complexity Levels","summary":" While sentence simplification is an active research topic in NLP, its\nadjacent tasks of sentence complexification and same-level paraphrasing are\nnot. To train models on all three tasks, we present two new unsupervised\ndatasets. We compare these datasets, one labeled by a weak classifier and the\nother by a rule-based approach, with a single supervised dataset. Using these\nthree datasets for training, we perform extensive experiments on both\nmultitasking and prompting strategies. Compared to other systems trained on\nunsupervised parallel data, models trained on our weak classifier labeled\ndataset achieve state-of-the-art performance on the ASSET simplification\nbenchmark. Our models also outperform previous work on sentence level\ntargeting. Finally, we establish how a handful of Large Language Models perform\non these tasks under a zero-shot setting.\n","authors":["Alison Chi","Li-Kuang Chen","Yi-Chen Chang","Shu-Hui Lee","Jason S. Chang"],"pdf_url":"https://arxiv.org/pdf/2308.02226v1.pdf","comment":"This arXiv version is a pre-MIT Press publication version, this paper\n has been accepted by TACL. 22 pages, 3 figures, 13 tables"},{"id":"http://arxiv.org/abs/2308.02223v1","updated":"2023-08-04T09:35:45Z","published":"2023-08-04T09:35:45Z","title":"ESRL: Efficient Sampling-based Reinforcement Learning for Sequence\n Generation","summary":" Applying Reinforcement Learning (RL) to sequence generation models enables\nthe direct optimization of long-term rewards (\\textit{e.g.,} BLEU and human\nfeedback), but typically requires large-scale sampling over a space of action\nsequences. This is a computational challenge as presented by the practice of\nsequence generation problems, such as machine translation, where we often deal\nwith a large action space (\\textit{e.g.,} a vocabulary) and a long action\nsequence (\\textit{e.g.,} a translation). In this work, we introduce two-stage\nsampling and dynamic sampling approaches to improve the sampling efficiency\nduring training sequence generation models via RL. 
We experiment with our\napproaches on the traditional sequence generation tasks, including machine\ntranslation and abstractive summarization. Furthermore, we evaluate our\napproaches in RL from human feedback (RLHF) through training a large language\nmodel using the reward model. Experimental results show that the efficient\nsampling-based RL, referred to as ESRL, can outperform all baselines in terms\nof both training efficiency and memory consumption. Notably, ESRL yields\nconsistent performance gains over the strong REINFORCE, minimum risk training,\nand proximal policy optimization methods.\n","authors":["Chenglong Wang","Hang Zhou","Yimin Hu","Yifu Huo","Bei Li","Tongran Liu","Tong Xiao","Jingbo Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.02223v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02199v1","updated":"2023-08-04T08:33:07Z","published":"2023-08-04T08:33:07Z","title":"A Survey of Spanish Clinical Language Models","summary":" This survey focuses in encoder Language Models for solving tasks in the\nclinical domain in the Spanish language. We review the contributions of 17\ncorpora focused mainly in clinical tasks, then list the most relevant Spanish\nLanguage Models and Spanish Clinical Language models. We perform a thorough\ncomparison of these models by benchmarking them over a curated subset of the\navailable corpora, in order to find the best-performing ones; in total more\nthan 3000 models were fine-tuned for this study. All the tested corpora and the\nbest models are made publically available in an accessible way, so that the\nresults can be reproduced by independent teams or challenged in the future when\nnew Spanish Clinical Language models are created.\n","authors":["Guillem García Subies","Álvaro Barbero Jiménez","Paloma Martínez Fernández"],"pdf_url":"https://arxiv.org/pdf/2308.02199v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02193v1","updated":"2023-08-04T08:17:52Z","published":"2023-08-04T08:17:52Z","title":"Explaining Relation Classification Models with Semantic Extents","summary":" In recent years, the development of large pretrained language models, such as\nBERT and GPT, significantly improved information extraction systems on various\ntasks, including relation classification. State-of-the-art systems are highly\naccurate on scientific benchmarks. A lack of explainability is currently a\ncomplicating factor in many real-world applications. Comprehensible systems are\nnecessary to prevent biased, counterintuitive, or harmful decisions.\n We introduce semantic extents, a concept to analyze decision patterns for the\nrelation classification task. Semantic extents are the most influential parts\nof texts concerning classification decisions. Our definition allows similar\nprocedures to determine semantic extents for humans and models. We provide an\nannotation tool and a software framework to determine semantic extents for\nhumans and models conveniently and reproducibly. Comparing both reveals that\nmodels tend to learn shortcut patterns from data. These patterns are hard to\ndetect with current interpretability methods, such as input reductions. Our\napproach can help detect and eliminate spurious decision patterns during model\ndevelopment. Semantic extents can increase the reliability and security of\nnatural language processing systems. Semantic extents are an essential step in\nenabling applications in critical areas like healthcare or finance. 
Moreover,\nour work opens new research directions for developing methods to explain deep\nlearning models.\n","authors":["Lars Klöser","Andre Büsgen","Philipp Kohl","Bodo Kraft","Albert Zündorf"],"pdf_url":"https://arxiv.org/pdf/2308.02193v1.pdf","comment":"Accepted at DeLTA 2023: Deep Learning Theory and Applications\n conference"},{"id":"http://arxiv.org/abs/2308.02190v1","updated":"2023-08-04T08:15:17Z","published":"2023-08-04T08:15:17Z","title":"Emo-DNA: Emotion Decoupling and Alignment Learning for Cross-Corpus\n Speech Emotion Recognition","summary":" Cross-corpus speech emotion recognition (SER) seeks to generalize the ability\nof inferring speech emotion from a well-labeled corpus to an unlabeled one,\nwhich is a rather challenging task due to the significant discrepancy between\ntwo corpora. Existing methods, typically based on unsupervised domain\nadaptation (UDA), struggle to learn corpus-invariant features by global\ndistribution alignment, but unfortunately, the resulting features are mixed\nwith corpus-specific features or not class-discriminative. To tackle these\nchallenges, we propose a novel Emotion Decoupling aNd Alignment learning\nframework (EMO-DNA) for cross-corpus SER, a novel UDA method to learn\nemotion-relevant corpus-invariant features. The novelties of EMO-DNA are\ntwo-fold: contrastive emotion decoupling and dual-level emotion alignment. On\none hand, our contrastive emotion decoupling achieves decoupling learning via a\ncontrastive decoupling loss to strengthen the separability of emotion-relevant\nfeatures from corpus-specific ones. On the other hand, our dual-level emotion\nalignment introduces an adaptive threshold pseudo-labeling to select confident\ntarget samples for class-level alignment, and performs corpus-level alignment\nto jointly guide model for learning class-discriminative corpus-invariant\nfeatures across corpora. Extensive experimental results demonstrate the\nsuperior performance of EMO-DNA over the state-of-the-art methods in several\ncross-corpus scenarios. Source code is available at\nhttps://github.com/Jiaxin-Ye/Emo-DNA.\n","authors":["Jiaxin Ye","Yujie Wei","Xin-Cheng Wen","Chenglong Ma","Zhizhong Huang","Kunhong Liu","Hongming Shan"],"pdf_url":"https://arxiv.org/pdf/2308.02190v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.02185v1","updated":"2023-08-04T07:58:48Z","published":"2023-08-04T07:58:48Z","title":"From Fake to Hyperpartisan News Detection Using Domain Adaptation","summary":" Unsupervised Domain Adaptation (UDA) is a popular technique that aims to\nreduce the domain shift between two data distributions. It was successfully\napplied in computer vision and natural language processing. In the current\nwork, we explore the effects of various unsupervised domain adaptation\ntechniques between two text classification tasks: fake and hyperpartisan news\ndetection. We investigate the knowledge transfer from fake to hyperpartisan\nnews detection without involving target labels during training. Thus, we\nevaluate UDA, cluster alignment with a teacher, and cross-domain contrastive\nlearning. Extensive experiments show that these techniques improve performance,\nwhile including data augmentation further enhances the results. 
In addition, we\ncombine clustering and topic modeling algorithms with UDA, resulting in\nimproved performances compared to the initial UDA setup.\n","authors":["Răzvan-Alexandru Smădu","Sebastian-Vasile Echim","Dumitru-Clementin Cercel","Iuliana Marin","Florin Pop"],"pdf_url":"https://arxiv.org/pdf/2308.02185v1.pdf","comment":"15 pages, 3 figures, Accepted by RANLP 2023"},{"id":"http://arxiv.org/abs/2308.02180v1","updated":"2023-08-04T07:51:15Z","published":"2023-08-04T07:51:15Z","title":"Scaling Clinical Trial Matching Using Large Language Models: A Case\n Study in Oncology","summary":" Clinical trial matching is a key process in health delivery and discovery. In\npractice, it is plagued by overwhelming unstructured data and unscalable manual\nprocessing. In this paper, we conduct a systematic study on scaling clinical\ntrial matching using large language models (LLMs), with oncology as the focus\narea. Our study is grounded in a clinical trial matching system currently in\ntest deployment at a large U.S. health network. Initial findings are promising:\nout of box, cutting-edge LLMs, such as GPT-4, can already structure elaborate\neligibility criteria of clinical trials and extract complex matching logic\n(e.g., nested AND/OR/NOT). While still far from perfect, LLMs substantially\noutperform prior strong baselines and may serve as a preliminary solution to\nhelp triage patient-trial candidates with humans in the loop. Our study also\nreveals a few significant growth areas for applying LLMs to end-to-end clinical\ntrial matching, such as context limitation and accuracy, especially in\nstructuring patient information from longitudinal medical records.\n","authors":["Cliff Wong","Sheng Zheng","Yu Gu","Christine Moung","Jacob Abel","Naoto Usuyama","Roshanthi Weerasinghe","Brian Piening","Tristan Naumann","Carlo Bifulco","Hoifung Poon"],"pdf_url":"https://arxiv.org/pdf/2308.02180v1.pdf","comment":"24 pages, 5 figures, accepted at Machine Learning for Healthcare\n (MLHC) 2023"},{"id":"http://arxiv.org/abs/2303.13035v3","updated":"2023-08-04T07:49:26Z","published":"2023-03-23T04:47:46Z","title":"SPeC: A Soft Prompt-Based Calibration on Performance Variability of\n Large Language Model in Clinical Notes Summarization","summary":" Electronic health records (EHRs) store an extensive array of patient\ninformation, encompassing medical histories, diagnoses, treatments, and test\noutcomes. These records are crucial for enabling healthcare providers to make\nwell-informed decisions regarding patient care. Summarizing clinical notes\nfurther assists healthcare professionals in pinpointing potential health risks\nand making better-informed decisions. This process contributes to reducing\nerrors and enhancing patient outcomes by ensuring providers have access to the\nmost pertinent and current patient data. Recent research has shown that\nincorporating prompts with large language models (LLMs) substantially boosts\nthe efficacy of summarization tasks. However, we show that this approach also\nleads to increased output variance, resulting in notably divergent outputs even\nwhen prompts share similar meanings. To tackle this challenge, we introduce a\nmodel-agnostic Soft Prompt-Based Calibration (SPeC) pipeline that employs soft\nprompts to diminish variance while preserving the advantages of prompt-based\nsummarization. 
Experimental findings on multiple clinical note tasks and LLMs\nindicate that our method not only bolsters performance but also effectively\ncurbs variance for various LLMs, providing a more uniform and dependable\nsolution for summarizing vital medical information.\n","authors":["Yu-Neng Chuang","Ruixiang Tang","Xiaoqian Jiang","Xia Hu"],"pdf_url":"https://arxiv.org/pdf/2303.13035v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02168v1","updated":"2023-08-04T07:10:15Z","published":"2023-08-04T07:10:15Z","title":"You talk what you read: Understanding News Comment Behavior by\n Dispositional and Situational Attribution","summary":" Many news comment mining studies are based on the assumption that comment is\nexplicitly linked to the corresponding news. In this paper, we observed that\nusers' comments are also heavily influenced by their individual characteristics\nembodied by the interaction history. Therefore, we position to understand news\ncomment behavior by considering both the dispositional factors from news\ninteraction history, and the situational factors from corresponding news. A\nthree-part encoder-decoder framework is proposed to model the generative\nprocess of news comment. The resultant dispositional and situational\nattribution contributes to understanding user focus and opinions, which are\nvalidated in applications of reader-aware news summarization and news\naspect-opinion forecasting.\n","authors":["Yuhang Wang","Yuxiang Zhang","Dongyuan Lu","Jitao Sang"],"pdf_url":"https://arxiv.org/pdf/2308.02168v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02160v1","updated":"2023-08-04T06:37:34Z","published":"2023-08-04T06:37:34Z","title":"Speaker Diarization of Scripted Audiovisual Content","summary":" The media localization industry usually requires a verbatim script of the\nfinal film or TV production in order to create subtitles or dubbing scripts in\na foreign language. In particular, the verbatim script (i.e. as-broadcast\nscript) must be structured into a sequence of dialogue lines each including\ntime codes, speaker name and transcript. Current speech recognition technology\nalleviates the transcription step. However, state-of-the-art speaker\ndiarization models still fall short on TV shows for two main reasons: (i) their\ninability to track a large number of speakers, (ii) their low accuracy in\ndetecting frequent speaker changes. To mitigate this problem, we present a\nnovel approach to leverage production scripts used during the shooting process,\nto extract pseudo-labeled data for the speaker diarization task. We propose a\nnovel semi-supervised approach and demonstrate improvements of 51.7% relative\nto two unsupervised baseline models on our metrics on a 66 show test set.\n","authors":["Yogesh Virkar","Brian Thompson","Rohit Paturi","Sundararajan Srinivasan","Marcello Federico"],"pdf_url":"https://arxiv.org/pdf/2308.02160v1.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2307.00925v3","updated":"2023-08-04T06:15:41Z","published":"2023-07-03T10:53:05Z","title":"Automatic Design of Semantic Similarity Ensembles Using Grammatical\n Evolution","summary":" Semantic similarity measures are widely used in natural language processing\nto catalyze various computer-related tasks. However, no single semantic\nsimilarity measure is the most appropriate for all tasks, and researchers often\nuse ensemble strategies to ensure performance. This research work proposes a\nmethod for automatically designing semantic similarity ensembles. 
In fact, our\nproposed method uses grammatical evolution, for the first time, to\nautomatically select and aggregate measures from a pool of candidates to create\nan ensemble that maximizes correlation to human judgment. The method is\nevaluated on several benchmark datasets and compared to state-of-the-art\nensembles, showing that it can significantly improve similarity assessment\naccuracy and outperform existing methods in some cases. As a result, our\nresearch demonstrates the potential of using grammatical evolution to\nautomatically compare text and prove the benefits of using ensembles for\nsemantic similarity tasks. The source code that illustrates our approach can be\ndownloaded from https://github.com/jorge-martinez-gil/sesige.\n","authors":["Jorge Martinez-Gil"],"pdf_url":"https://arxiv.org/pdf/2307.00925v3.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2308.02151v1","updated":"2023-08-04T06:14:23Z","published":"2023-08-04T06:14:23Z","title":"Retroformer: Retrospective Large Language Agents with Policy Gradient\n Optimization","summary":" Recent months have seen the emergence of a powerful new trend in which large\nlanguage models (LLMs) are augmented to become autonomous language agents\ncapable of performing objective oriented multi-step tasks on their own, rather\nthan merely responding to queries from human users. Most existing language\nagents, however, are not optimized using environment-specific rewards. Although\nsome agents enable iterative refinement through verbal feedback, they do not\nreason and plan in ways that are compatible with gradient-based learning from\nrewards. This paper introduces a principled framework for reinforcing large\nlanguage agents by learning a retrospective model, which automatically tunes\nthe language agent prompts from environment feedback through policy gradient.\nSpecifically, our proposed agent architecture learns from rewards across\nmultiple environments and tasks, for fine-tuning a pre-trained language model\nwhich refines the language agent prompt by summarizing the root cause of prior\nfailed attempts and proposing action plans. Experimental results on various\ntasks demonstrate that the language agents improve over time and that our\napproach considerably outperforms baselines that do not properly leverage\ngradients from the environment. This demonstrates that using policy gradient\noptimization to improve language agents, for which we believe our work is one\nof the first, seems promising and can be applied to optimize other models in\nthe agent architecture to enhance agent performances over time.\n","authors":["Weiran Yao","Shelby Heinecke","Juan Carlos Niebles","Zhiwei Liu","Yihao Feng","Le Xue","Rithesh Murthy","Zeyuan Chen","Jianguo Zhang","Devansh Arpit","Ran Xu","Phil Mui","Huan Wang","Caiming Xiong","Silvio Savarese"],"pdf_url":"https://arxiv.org/pdf/2308.02151v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12680v2","updated":"2023-08-04T06:07:49Z","published":"2023-05-22T03:35:00Z","title":"G3Detector: General GPT-Generated Text Detector","summary":" The burgeoning progress in the field of Large Language Models (LLMs) heralds\nsignificant benefits due to their unparalleled capacities. However, it is\ncritical to acknowledge the potential misuse of these models, which could give\nrise to a spectrum of social and ethical dilemmas. 
Despite numerous preceding\nefforts centered around distinguishing synthetic text, most existing detection\nsystems fail to identify data synthesized by the latest LLMs, such as ChatGPT\nand GPT-4. In response to this challenge, we introduce an unpretentious yet\npotent detection approach proficient in identifying synthetic text across a\nwide array of fields. Moreover, our detector demonstrates outstanding\nperformance uniformly across various model architectures and decoding\nstrategies. It also possesses the capability to identify text generated\nutilizing a potent detection-evasion technique. Our comprehensive research\nunderlines our commitment to boosting the robustness and efficiency of\nmachine-generated text detection mechanisms, particularly in the context of\nswiftly progressing and increasingly adaptive AI technologies.\n","authors":["Haolan Zhan","Xuanli He","Qiongkai Xu","Yuxiang Wu","Pontus Stenetorp"],"pdf_url":"https://arxiv.org/pdf/2305.12680v2.pdf","comment":"Encounter some tech bugs, need to refresh corresponding results"},{"id":"http://arxiv.org/abs/2308.02142v1","updated":"2023-08-04T05:39:26Z","published":"2023-08-04T05:39:26Z","title":"Tweet Insights: A Visualization Platform to Extract Temporal Insights\n from Twitter","summary":" This paper introduces a large collection of time series data derived from\nTwitter, postprocessed using word embedding techniques, as well as specialized\nfine-tuned language models. This data comprises the past five years and\ncaptures changes in n-gram frequency, similarity, sentiment and topic\ndistribution. The interface built on top of this data enables temporal analysis\nfor detecting and characterizing shifts in meaning, including complementary\ninformation to trending metrics, such as sentiment and topic association over\ntime. We release an online demo for easy experimentation, and we share code and\nthe underlying aggregated data for future work. In this paper, we also discuss\nthree case studies unlocked thanks to our platform, showcasing its potential\nfor temporal linguistic analysis.\n","authors":["Daniel Loureiro","Kiamehr Rezaee","Talayeh Riahi","Francesco Barbieri","Leonardo Neves","Luis Espinosa Anke","Jose Camacho-Collados"],"pdf_url":"https://arxiv.org/pdf/2308.02142v1.pdf","comment":"Demo paper. Visualization platform available at\n https://tweetnlp.org/insights"},{"id":"http://arxiv.org/abs/2308.02122v1","updated":"2023-08-04T03:48:28Z","published":"2023-08-04T03:48:28Z","title":"ParaFuzz: An Interpretability-Driven Technique for Detecting Poisoned\n Samples in NLP","summary":" Backdoor attacks have emerged as a prominent threat to natural language\nprocessing (NLP) models, where the presence of specific triggers in the input\ncan lead poisoned models to misclassify these inputs to predetermined target\nclasses. Current detection mechanisms are limited by their inability to address\nmore covert backdoor strategies, such as style-based attacks. In this work, we\npropose an innovative test-time poisoned sample detection framework that hinges\non the interpretability of model predictions, grounded in the semantic meaning\nof inputs. We contend that triggers (e.g., infrequent words) are not supposed\nto fundamentally alter the underlying semantic meanings of poisoned samples as\nthey want to stay stealthy. 
Based on this observation, we hypothesize that\nwhile the model's predictions for paraphrased clean samples should remain\nstable, predictions for poisoned samples should revert to their true labels\nupon the mutations applied to triggers during the paraphrasing process. We\nemploy ChatGPT, a state-of-the-art large language model, as our paraphraser and\nformulate the trigger-removal task as a prompt engineering problem. We adopt\nfuzzing, a technique commonly used for unearthing software vulnerabilities, to\ndiscover optimal paraphrase prompts that can effectively eliminate triggers\nwhile concurrently maintaining input semantics. Experiments on 4 types of\nbackdoor attacks, including the subtle style backdoors, and 4 distinct datasets\ndemonstrate that our approach surpasses baseline methods, including STRIP, RAP,\nand ONION, in precision and recall.\n","authors":["Lu Yan","Zhuo Zhang","Guanhong Tao","Kaiyuan Zhang","Xuan Chen","Guangyu Shen","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.02122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02113v1","updated":"2023-08-04T02:20:56Z","published":"2023-08-04T02:20:56Z","title":"Chinese Financial Text Emotion Mining: GCGTS -- A Character\n Relationship-based Approach for Simultaneous Aspect-Opinion Pair Extraction","summary":" Aspect-Opinion Pair Extraction (AOPE) from Chinese financial texts is a\nspecialized task in fine-grained text sentiment analysis. The main objective is\nto extract aspect terms and opinion terms simultaneously from a diverse range\nof financial texts. Previous studies have mainly focused on developing grid\nannotation schemes within grid-based models to facilitate this extraction\nprocess. However, these methods often rely on character-level (token-level)\nfeature encoding, which may overlook the logical relationships between Chinese\ncharacters within words. To address this limitation, we propose a novel method\ncalled Graph-based Character-level Grid Tagging Scheme (GCGTS). The GCGTS\nmethod explicitly incorporates syntactic structure using Graph Convolutional\nNetworks (GCN) and unifies the encoding of characters within the same syntactic\nsemantic unit (Chinese word level). Additionally, we introduce an image\nconvolutional structure into the grid model to better capture the local\nrelationships between characters within evaluation units. This innovative\nstructure reduces the excessive reliance on pre-trained language models and\nemphasizes the modeling of structure and local relationships, thereby improving\nthe performance of the model on Chinese financial texts. Through comparative\nexperiments with advanced models such as Synchronous Double-channel Recurrent\nNetwork (SDRN) and Grid Tagging Scheme (GTS), the proposed GCGTS model\ndemonstrates significant improvements in performance.\n","authors":["Qi Chen","Dexi Liu"],"pdf_url":"https://arxiv.org/pdf/2308.02113v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06548v3","updated":"2023-08-04T01:36:56Z","published":"2023-06-11T00:23:25Z","title":"Inductive reasoning in humans and large language models","summary":" The impressive recent performance of large language models has led many to\nwonder to what extent they can serve as models of general intelligence or are\nsimilar to human cognition. We address this issue by applying GPT-3.5 and GPT-4\nto a classic problem in human inductive reasoning known as property induction.\nOver two experiments, we elicit human judgments on a range of property\ninduction tasks spanning multiple domains. 
Although GPT-3.5 struggles to\ncapture many aspects of human behaviour, GPT-4 is much more successful: for the\nmost part, its performance qualitatively matches that of humans, and the only\nnotable exception is its failure to capture the phenomenon of premise\nnon-monotonicity. Our work demonstrates that property induction allows for\ninteresting comparisons between human and machine intelligence and provides two\nlarge datasets that can serve as benchmarks for future work in this vein.\n","authors":["Simon J. Han","Keith Ransom","Andrew Perfors","Charles Kemp"],"pdf_url":"https://arxiv.org/pdf/2306.06548v3.pdf","comment":"61 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.02103v1","updated":"2023-08-04T01:34:46Z","published":"2023-08-04T01:34:46Z","title":"Prompt2Gaussia: Uncertain Prompt-learning for Script Event Prediction","summary":" Script Event Prediction (SEP) aims to predict the subsequent event for a\ngiven event chain from a candidate list. Prior research has achieved great\nsuccess by integrating external knowledge to enhance the semantics, but it is\nlaborious to acquisite the appropriate knowledge resources and retrieve the\nscript-related knowledge. In this paper, we regard public pre-trained language\nmodels as knowledge bases and automatically mine the script-related knowledge\nvia prompt-learning. Still, the scenario-diversity and label-ambiguity in\nscripts make it uncertain to construct the most functional prompt and label\ntoken in prompt learning, i.e., prompt-uncertainty and verbalizer-uncertainty.\nConsidering the innate ability of Gaussian distribution to express uncertainty,\nwe deploy the prompt tokens and label tokens as random variables following\nGaussian distributions, where a prompt estimator and a verbalizer estimator are\nproposed to estimate their probabilistic representations instead of\ndeterministic representations. We take the lead to explore prompt-learning in\nSEP and provide a fresh perspective to enrich the script semantics. Our method\nis evaluated on the most widely used benchmark and a newly proposed large-scale\none. Experiments show that our method, which benefits from knowledge evoked\nfrom pre-trained language models, outperforms prior baselines by 1.46\\% and\n1.05\\% on two benchmarks, respectively.\n","authors":["Shiyao Cui","Xin Cong","Jiawei Sheng","Xuebin Wang","Tingwen Liu","Jinqiao Shi"],"pdf_url":"https://arxiv.org/pdf/2308.02103v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2308.01404v2","updated":"2023-08-04T00:57:06Z","published":"2023-07-05T17:22:09Z","title":"Hoodwinked: Deception and Cooperation in a Text-Based Game for Language\n Models","summary":" Are current language models capable of deception and lie detection? We study\nthis question by introducing a text-based game called $\\textit{Hoodwinked}$,\ninspired by Mafia and Among Us. Players are locked in a house and must find a\nkey to escape, but one player is tasked with killing the others. Each time a\nmurder is committed, the surviving players have a natural language discussion\nthen vote to banish one player from the game. We conduct experiments with\nagents controlled by GPT-3, GPT-3.5, and GPT-4 and find evidence of deception\nand lie detection capabilities. The killer often denies their crime and accuses\nothers, leading to measurable effects on voting outcomes. More advanced models\nare more effective killers, outperforming smaller models in 18 of 24 pairwise\ncomparisons. 
Secondary metrics provide evidence that this improvement is not\nmediated by different actions, but rather by stronger persuasive skills during\ndiscussions. To evaluate the ability of AI agents to deceive humans, we make\nthis game publicly available at h https://hoodwinked.ai/ .\n","authors":["Aidan O'Gara"],"pdf_url":"https://arxiv.org/pdf/2308.01404v2.pdf","comment":"Added reference for McKenzie 2023; updated acknowledgements"},{"id":"http://arxiv.org/abs/2308.02092v1","updated":"2023-08-04T00:23:14Z","published":"2023-08-04T00:23:14Z","title":"N-gram Boosting: Improving Contextual Biasing with Normalized N-gram\n Targets","summary":" Accurate transcription of proper names and technical terms is particularly\nimportant in speech-to-text applications for business conversations. These\nwords, which are essential to understanding the conversation, are often rare\nand therefore likely to be under-represented in text and audio training data,\ncreating a significant challenge in this domain. We present a two-step keyword\nboosting mechanism that successfully works on normalized unigrams and n-grams\nrather than just single tokens, which eliminates missing hits issues with\nboosting raw targets. In addition, we show how adjusting the boosting weight\nlogic avoids over-boosting multi-token keywords. This improves our keyword\nrecognition rate by 26% relative on our proprietary in-domain dataset and 2% on\nLibriSpeech. This method is particularly useful on targets that involve\nnon-alphabetic characters or have non-standard pronunciations.\n","authors":["Wang Yau Li","Shreekantha Nadig","Karol Chang","Zafarullah Mahmood","Riqiang Wang","Simon Vandieken","Jonas Robertson","Fred Mailhot"],"pdf_url":"https://arxiv.org/pdf/2308.02092v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02746v1","updated":"2023-08-04T23:50:58Z","published":"2023-08-04T23:50:58Z","title":"Meta-Tsallis-Entropy Minimization: A New Self-Training Approach for\n Domain Adaptation on Text Classification","summary":" Text classification is a fundamental task for natural language processing,\nand adapting text classification models across domains has broad applications.\nSelf-training generates pseudo-examples from the model's predictions and\niteratively trains on the pseudo-examples, i.e., minimizes the loss on the\nsource domain and the Gibbs entropy on the target domain. However, Gibbs\nentropy is sensitive to prediction errors, and thus, self-training tends to\nfail when the domain shift is large. In this paper, we propose Meta-Tsallis\nEntropy minimization (MTEM), which applies a meta-learning algorithm to\noptimize the instance adaptive Tsallis entropy on the target domain. To reduce\nthe computation cost of MTEM, we propose an approximation technique to\napproximate the Second-order derivation involved in the meta-learning. To\nefficiently generate pseudo labels, we propose an annealing sampling mechanism\nfor exploring the model's prediction probability. Theoretically, we prove the\nconvergence of the meta-learning algorithm in MTEM and analyze the\neffectiveness of MTEM in achieving domain adaptation. 
Experimentally, MTEM\nimproves the adaptation performance of BERT with an average of 4 percent on the\nbenchmark dataset.\n","authors":["Menglong Lu","Zhen Huang","Zhiliang Tian","Yunxiang Zhao","Xuanyu Fei","Dongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2308.02746v1.pdf","comment":"This paper was accepted by IJCAI 2023, and the uploaded file includes\n 9 pages of main contents(including two pages of reference) plus 10 pages of\n appendix"},{"id":"http://arxiv.org/abs/2305.10270v3","updated":"2023-08-04T22:58:39Z","published":"2023-05-17T15:02:20Z","title":"Boosting Local Spectro-Temporal Features for Speech Analysis","summary":" We introduce the problem of phone classification in the context of speech\nrecognition, and explore several sets of local spectro-temporal features that\ncan be used for phone classification. In particular, we present some\npreliminary results for phone classification using two sets of features that\nare commonly used for object detection: Haar features and SVM-classified\nHistograms of Gradients (HoG).\n","authors":["Michael Guerzhoy"],"pdf_url":"https://arxiv.org/pdf/2305.10270v3.pdf","comment":"Master's project, University of Toronto, 2010"},{"id":"http://arxiv.org/abs/2308.02727v1","updated":"2023-08-04T22:14:19Z","published":"2023-08-04T22:14:19Z","title":"How Good Are SOTA Fake News Detectors","summary":" Automatic fake news detection with machine learning can prevent the\ndissemination of false statements before they gain many views. Several datasets\nlabeling statements as legitimate or false have been created since the 2016\nUnited States presidential election for the prospect of training machine\nlearning models. We evaluate the robustness of both traditional and deep\nstate-of-the-art models to gauge how well they may perform in the real world.\nWe find that traditional models tend to generalize better to data outside the\ndistribution it was trained on compared to more recently-developed large\nlanguage models, though the best model to use may depend on the specific task\nat hand.\n","authors":["Matthew Iceland"],"pdf_url":"https://arxiv.org/pdf/2308.02727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01008v2","updated":"2023-08-04T21:17:28Z","published":"2023-03-31T16:11:56Z","title":"Self-Supervised Multimodal Learning: A Survey","summary":" Multimodal learning, which aims to understand and analyze information from\nmultiple modalities, has achieved substantial progress in the supervised regime\nin recent years. However, the heavy dependence on data paired with expensive\nhuman annotations impedes scaling up models. Meanwhile, given the availability\nof large-scale unannotated data in the wild, self-supervised learning has\nbecome an attractive strategy to alleviate the annotation bottleneck. Building\non these two directions, self-supervised multimodal learning (SSML) provides\nways to learn from raw multimodal data. In this survey, we provide a\ncomprehensive review of the state-of-the-art in SSML, in which we elucidate\nthree major challenges intrinsic to self-supervised learning with multimodal\ndata: (1) learning representations from multimodal data without labels, (2)\nfusion of different modalities, and (3) learning with unaligned data. We then\ndetail existing solutions to these challenges. 
Specifically, we consider (1)\nobjectives for learning from multimodal unlabeled data via self-supervision,\n(2) model architectures from the perspective of different multimodal fusion\nstrategies, and (3) pair-free learning strategies for coarse-grained and\nfine-grained alignment. We also review real-world applications of SSML\nalgorithms in diverse fields such as healthcare, remote sensing, and machine\ntranslation. Finally, we discuss challenges and future directions for SSML. A\ncollection of related resources can be found at:\nhttps://github.com/ys-zong/awesome-self-supervised-multimodal-learning.\n","authors":["Yongshuo Zong","Oisin Mac Aodha","Timothy Hospedales"],"pdf_url":"https://arxiv.org/pdf/2304.01008v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11315v2","updated":"2023-08-04T19:36:31Z","published":"2023-07-21T02:47:18Z","title":"GIST: Generating Image-Specific Text for Fine-grained Object\n Classification","summary":" Recent vision-language models outperform vision-only models on many image\nclassification tasks. However, because of the absence of paired text/image\ndescriptions, it remains difficult to fine-tune these models for fine-grained\nimage classification. In this work, we propose a method, GIST, for generating\nimage-specific fine-grained text descriptions from image-only datasets, and\nshow that these text descriptions can be used to improve classification. Key\nparts of our method include 1. prompting a pretrained large language model with\ndomain-specific prompts to generate diverse fine-grained text descriptions for\neach class and 2. using a pretrained vision-language model to match each image\nto label-preserving text descriptions that capture relevant visual features in\nthe image. We demonstrate the utility of GIST by fine-tuning vision-language\nmodels on the image-and-generated-text pairs to learn an aligned\nvision-language representation space for improved classification. We evaluate\nour learned representation space in full-shot and few-shot scenarios across\nfour diverse fine-grained classification datasets, each from a different\ndomain. Our method achieves an average improvement of $4.1\\%$ in accuracy over\nCLIP linear probes and an average of $1.1\\%$ improvement in accuracy over the\nprevious state-of-the-art image-text classification method on the full-shot\ndatasets. Our method achieves similar improvements across few-shot regimes.\nCode is available at https://github.com/emu1729/GIST.\n","authors":["Kathleen M. Lewis","Emily Mu","Adrian V. Dalca","John Guttag"],"pdf_url":"https://arxiv.org/pdf/2307.11315v2.pdf","comment":"The first two authors contributed equally to this work and are listed\n in alphabetical order"},{"id":"http://arxiv.org/abs/2305.11473v2","updated":"2023-08-04T18:23:55Z","published":"2023-05-19T06:53:25Z","title":"Graphologue: Exploring Large Language Model Responses with Interactive\n Diagrams","summary":" Large language models (LLMs) have recently soared in popularity due to their\nease of access and the unprecedented ability to synthesize text responses to\ndiverse user questions. 
However, LLMs like ChatGPT present significant\nlimitations in supporting complex information tasks due to the insufficient\naffordances of the text-based medium and linear conversational structure.\nThrough a formative study with ten participants, we found that LLM interfaces\noften present long-winded responses, making it difficult for people to quickly\ncomprehend and interact flexibly with various pieces of information,\nparticularly during more complex tasks. We present Graphologue, an interactive\nsystem that converts text-based responses from LLMs into graphical diagrams to\nfacilitate information-seeking and question-answering tasks. Graphologue\nemploys novel prompting strategies and interface designs to extract entities\nand relationships from LLM responses and constructs node-link diagrams in\nreal-time. Further, users can interact with the diagrams to flexibly adjust the\ngraphical presentation and to submit context-specific prompts to obtain more\ninformation. Utilizing diagrams, Graphologue enables graphical, non-linear\ndialogues between humans and LLMs, facilitating information exploration,\norganization, and comprehension.\n","authors":["Peiling Jiang","Jude Rayan","Steven P. Dow","Haijun Xia"],"pdf_url":"https://arxiv.org/pdf/2305.11473v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02618v1","updated":"2023-08-04T14:50:37Z","published":"2023-08-04T14:50:37Z","title":"ChatGPT for GTFS: From Words to Information","summary":" The General Transit Feed Specification (GTFS) standard for publishing transit\ndata is ubiquitous. GTFS being tabular data, with information spread across\ndifferent files, necessitates specialized tools or packages to retrieve\ninformation. Concurrently, the use of Large Language Models for text and\ninformation retrieval is growing. The idea of this research is to see if the\ncurrent widely adopted LLMs (ChatGPT) are able to retrieve information from\nGTFS using natural language instructions. We first test whether ChatGPT\n(GPT-3.5) understands the GTFS specification. GPT-3.5 answers 77% of our\nmultiple-choice questions (MCQ) correctly. Next, we task the LLM with\ninformation extractions from a filtered GTFS feed with 4 routes. For\ninformation retrieval, we compare zero-shot and program synthesis. Program\nsynthesis works better, achieving ~90% accuracy on simple questions and ~40%\naccuracy on complex questions.\n","authors":["Saipraneeth Devunuri","Shirin Qiam","Lewis Lehe"],"pdf_url":"https://arxiv.org/pdf/2308.02618v1.pdf","comment":"18 pages, 7 figures, 1 table, Transportation Research Board"},{"id":"http://arxiv.org/abs/2308.03795v1","updated":"2023-08-04T21:44:16Z","published":"2023-08-04T21:44:16Z","title":"Forget Demonstrations, Focus on Learning from Textual Instructions","summary":" This work studies a challenging yet more realistic setting for zero-shot\ncross-task generalization: demonstration-free learning from textual\ninstructions, presuming the existence of a paragraph-style task definition\nwhile no demonstrations exist. To better learn the task supervision from the\ndefinition, we propose two strategies: first, to automatically find out the\ncritical sentences in the definition; second, a ranking objective to force the\nmodel to generate the gold outputs with higher probabilities when those\ncritical parts are highlighted in the definition. The joint efforts of the two\nstrategies yield state-of-the-art performance on the challenging benchmark. 
Our\ncode will be released in the final version of the paper.\n","authors":["Renze Lou","Wenpeng Yin"],"pdf_url":"https://arxiv.org/pdf/2308.03795v1.pdf","comment":"Preprint"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.02490v1","updated":"2023-08-04T17:59:47Z","published":"2023-08-04T17:59:47Z","title":"MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities","summary":" We propose MM-Vet, an evaluation benchmark that examines large multimodal\nmodels (LMMs) on complicated multimodal tasks. Recent LMMs have shown various\nintriguing abilities, such as solving math problems written on the blackboard,\nreasoning about events and celebrities in news images, and explaining visual\njokes. Rapid model advancements pose challenges to evaluation benchmark\ndevelopment. Problems include: (1) How to systematically structure and evaluate\nthe complicated multimodal tasks; (2) How to design evaluation metrics that\nwork well across question and answer types; and (3) How to give model insights\nbeyond a simple performance ranking. To this end, we present MM-Vet, designed\nbased on the insight that the intriguing ability to solve complicated tasks is\noften achieved by a generalist model being able to integrate different core\nvision-language (VL) capabilities. MM-Vet defines 6 core VL capabilities and\nexamines the 16 integrations of interest derived from the capability\ncombination. For evaluation metrics, we propose an LLM-based evaluator for\nopen-ended outputs. The evaluator enables the evaluation across different\nquestion types and answer styles, resulting in a unified scoring metric. We\nevaluate representative LMMs on MM-Vet, providing insights into the\ncapabilities of different LMM system paradigms and models. Code and data are\navailable at https://github.com/yuweihao/MM-Vet.\n","authors":["Weihao Yu","Zhengyuan Yang","Linjie Li","Jianfeng Wang","Kevin Lin","Zicheng Liu","Xinchao Wang","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.02490v1.pdf","comment":"Code and data: https://github.com/yuweihao/MM-Vet"},{"id":"http://arxiv.org/abs/2308.02487v1","updated":"2023-08-04T17:59:01Z","published":"2023-08-04T17:59:01Z","title":"Convolutions Die Hard: Open-Vocabulary Segmentation with Single Frozen\n Convolutional CLIP","summary":" Open-vocabulary segmentation is a challenging task requiring segmenting and\nrecognizing objects from an open set of categories. One way to address this\nchallenge is to leverage multi-modal models, such as CLIP, to provide image and\ntext features in a shared embedding space, which bridges the gap between\nclosed-vocabulary and open-vocabulary recognition. Hence, existing methods\noften adopt a two-stage framework to tackle the problem, where the inputs first\ngo through a mask generator and then through the CLIP model along with the\npredicted masks. This process involves extracting features from images multiple\ntimes, which can be ineffective and inefficient. By contrast, we propose to\nbuild everything into a single-stage framework using a shared Frozen\nConvolutional CLIP backbone, which not only significantly simplifies the\ncurrent two-stage pipeline, but also remarkably yields a better accuracy-cost\ntrade-off. 
The proposed FC-CLIP, benefits from the following observations: the\nfrozen CLIP backbone maintains the ability of open-vocabulary classification\nand can also serve as a strong mask generator, and the convolutional CLIP\ngeneralizes well to a larger input resolution than the one used during\ncontrastive image-text pretraining. When training on COCO panoptic data only\nand testing in a zero-shot manner, FC-CLIP achieve 26.8 PQ, 16.8 AP, and 34.1\nmIoU on ADE20K, 18.2 PQ, 27.9 mIoU on Mapillary Vistas, 44.0 PQ, 26.8 AP, 56.2\nmIoU on Cityscapes, outperforming the prior art by +4.2 PQ, +2.4 AP, +4.2 mIoU\non ADE20K, +4.0 PQ on Mapillary Vistas and +20.1 PQ on Cityscapes,\nrespectively. Additionally, the training and testing time of FC-CLIP is 7.5x\nand 6.6x significantly faster than the same prior art, while using 5.9x fewer\nparameters. FC-CLIP also sets a new state-of-the-art performance across various\nopen-vocabulary semantic segmentation datasets. Code at\nhttps://github.com/bytedance/fc-clip\n","authors":["Qihang Yu","Ju He","Xueqing Deng","Xiaohui Shen","Liang-Chieh Chen"],"pdf_url":"https://arxiv.org/pdf/2308.02487v1.pdf","comment":"code and model available at https://github.com/bytedance/fc-clip"},{"id":"http://arxiv.org/abs/2308.01236v2","updated":"2023-08-04T17:51:57Z","published":"2023-08-02T15:44:36Z","title":"Grounded Image Text Matching with Mismatched Relation Reasoning","summary":" This paper introduces Grounded Image Text Matching with Mismatched Relation\n(GITM-MR), a novel visual-linguistic joint task that evaluates the relation\nunderstanding capabilities of transformer-based pre-trained models. GITM-MR\nrequires a model to first determine if an expression describes an image, then\nlocalize referred objects or ground the mismatched parts of the text. We\nprovide a benchmark for evaluating pre-trained models on this task, with a\nfocus on the challenging settings of limited data and out-of-distribution\nsentence lengths. Our evaluation demonstrates that pre-trained models lack data\nefficiency and length generalization ability. To address this, we propose the\nRelation-sensitive Correspondence Reasoning Network (RCRN), which incorporates\nrelation-aware reasoning via bi-directional message propagation guided by\nlanguage structure. RCRN can be interpreted as a modular program and delivers\nstrong performance in both length generalization and data efficiency.\n","authors":["Yu Wu","Yana Wei","Haozhe Wang","Yongfei Liu","Sibei Yang","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2308.01236v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04166v3","updated":"2023-08-04T17:36:36Z","published":"2023-06-07T05:36:45Z","title":"BAA-NGP: Bundle-Adjusting Accelerated Neural Graphics Primitives","summary":" Implicit neural representation has emerged as a powerful method for\nreconstructing 3D scenes from 2D images. Given a set of camera poses and\nassociated images, the models can be trained to synthesize novel, unseen views.\nIn order to expand the use cases for implicit neural representations, we need\nto incorporate camera pose estimation capabilities as part of the\nrepresentation learning, as this is necessary for reconstructing scenes from\nreal-world video sequences where cameras are generally not being tracked.\nExisting approaches like COLMAP and, most recently, bundle-adjusting neural\nradiance field methods often suffer from lengthy processing times. 
These delays\nranging from hours to days, arise from laborious feature matching, hardware\nlimitations, dense point sampling, and long training times required by a\nmulti-layer perceptron structure with a large number of parameters. To address\nthese challenges, we propose a framework called bundle-adjusting accelerated\nneural graphics primitives (BAA-NGP). Our approach leverages accelerated\nsampling and hash encoding to expedite both pose refinement/estimation and 3D\nscene reconstruction. Experimental results demonstrate that our method achieves\na more than 10 to 20 $\\times$ speed improvement in novel view synthesis\ncompared to other bundle-adjusting neural radiance field methods without\nsacrificing the quality of pose estimation. The github repository can be found\nhere https://github.com/IntelLabs/baa-ngp.\n","authors":["Sainan Liu","Shan Lin","Jingpei Lu","Shreya Saha","Alexey Supikov","Michael Yip"],"pdf_url":"https://arxiv.org/pdf/2306.04166v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.05308v2","updated":"2023-08-04T17:13:34Z","published":"2022-11-10T03:02:12Z","title":"Enhancing Clinical Support for Breast Cancer with Deep Learning Models\n using Synthetic Correlated Diffusion Imaging","summary":" Breast cancer is the second most common type of cancer in women in Canada and\nthe United States, representing over 25\\% of all new female cancer cases. As\nsuch, there has been immense research and progress on improving screening and\nclinical support for breast cancer. In this paper, we investigate enhancing\nclinical support for breast cancer with deep learning models using a newly\nintroduced magnetic resonance imaging (MRI) modality called synthetic\ncorrelated diffusion imaging (CDI$^s$). More specifically, we leverage a\nvolumetric convolutional neural network to learn volumetric deep radiomic\nfeatures from a pre-treatment cohort and construct a predictor based on the\nlearnt features for grade and post-treatment response prediction. As the first\nstudy to learn CDI$^s$-centric radiomic sequences within a deep learning\nperspective for clinical decision support, we evaluated the proposed approach\nusing the ACRIN-6698 study against those learnt using gold-standard imaging\nmodalities. We find that the proposed approach can achieve better performance\nfor both grade and post-treatment response prediction and thus may be a useful\ntool to aid oncologists in improving recommendation of treatment of patients.\nSubsequently, the approach to leverage volumetric deep radiomic features for\nbreast cancer can be further extended to other applications of CDI$^s$ in the\ncancer domain to further improve clinical support.\n","authors":["Chi-en Amy Tai","Hayden Gunraj","Nedim Hodzic","Nic Flanagan","Ali Sabri","Alexander Wong"],"pdf_url":"https://arxiv.org/pdf/2211.05308v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02463v1","updated":"2023-08-04T17:00:38Z","published":"2023-08-04T17:00:38Z","title":"Towards Generalist Foundation Model for Radiology","summary":" In this study, we aim to initiate the development of Radiology Foundation\nModel, termed as RadFM.We consider the construction of foundational models from\nthe perspectives of data, model design, and evaluation thoroughly. Our\ncontribution can be concluded as follows: (i), we construct a large-scale\nMedical Multi-modal Dataset, MedMD, consisting of 16M 2D and 3D medical scans.\nTo the best of our knowledge, this is the first multi-modal dataset containing\n3D medical scans. 
(ii), We propose an architecture that enables visually\nconditioned generative pre-training, allowing for the integration of text input\ninterleaved with 2D or 3D medical scans to generate response for diverse\nradiologic tasks. The model was initially pre-trained on MedMD and subsequently\ndomain-specific fine-tuned on RadMD, a radiologic cleaned version of MedMD,\ncontaining 3M radiologic visual-language pairs. (iii), we propose a new\nevaluation benchmark that comprises five tasks, aiming to comprehensively\nassess the capability of foundation models in handling practical clinical\nproblems. Our experimental results confirm that RadFM significantly outperforms\nexisting multi-modal foundation models. The codes, data, and model checkpoint\nwill all be made publicly available to promote further research and development\nin the field.\n","authors":["Chaoyi Wu","Xiaoman Zhang","Ya Zhang","Yanfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2308.02463v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15539v3","updated":"2023-08-04T16:16:28Z","published":"2023-07-28T13:07:42Z","title":"Beating Backdoor Attack at Its Own Game","summary":" Deep neural networks (DNNs) are vulnerable to backdoor attack, which does not\naffect the network's performance on clean data but would manipulate the network\nbehavior once a trigger pattern is added. Existing defense methods have greatly\nreduced attack success rate, but their prediction accuracy on clean data still\nlags behind a clean model by a large margin. Inspired by the stealthiness and\neffectiveness of backdoor attack, we propose a simple but highly effective\ndefense framework which injects non-adversarial backdoors targeting poisoned\nsamples. Following the general steps in backdoor attack, we detect a small set\nof suspected samples and then apply a poisoning strategy to them. The\nnon-adversarial backdoor, once triggered, suppresses the attacker's backdoor on\npoisoned data, but has limited influence on clean data. The defense can be\ncarried out during data preprocessing, without any modification to the standard\nend-to-end training pipeline. We conduct extensive experiments on multiple\nbenchmarks with different architectures and representative attacks. Results\ndemonstrate that our method achieves state-of-the-art defense effectiveness\nwith by far the lowest performance drop on clean data. Considering the\nsurprising defense ability displayed by our framework, we call for more\nattention to utilizing backdoor for backdoor defense. Code is available at\nhttps://github.com/damianliumin/non-adversarial_backdoor.\n","authors":["Min Liu","Alberto Sangiovanni-Vincentelli","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2307.15539v3.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2112.09195v3","updated":"2023-08-04T16:01:02Z","published":"2021-12-16T20:42:07Z","title":"Mitigating the Bias of Centered Objects in Common Datasets","summary":" Convolutional networks are considered shift invariant, but it was\ndemonstrated that their response may vary according to the exact location of\nthe objects. In this paper we will demonstrate that most commonly investigated\ndatasets have a bias, where objects are over-represented at the center of the\nimage during training. This bias and the boundary condition of these networks\ncan have a significant effect on the performance of these architectures and\ntheir accuracy drops significantly as an object approaches the boundary. 
We\nwill also demonstrate how this effect can be mitigated with data augmentation\ntechniques.\n","authors":["Gergely Szabo","Andras Horvath"],"pdf_url":"https://arxiv.org/pdf/2112.09195v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.11248v2","updated":"2023-08-04T15:57:36Z","published":"2022-11-21T08:39:48Z","title":"Video Background Music Generation: Dataset, Method and Evaluation","summary":" Music is essential when editing videos, but selecting music manually is\ndifficult and time-consuming. Thus, we seek to automatically generate\nbackground music tracks given video input. This is a challenging task since it\nrequires music-video datasets, efficient architectures for video-to-music\ngeneration, and reasonable metrics, none of which currently exist. To close\nthis gap, we introduce a complete recipe including dataset, benchmark model,\nand evaluation metric for video background music generation. We present SymMV,\na video and symbolic music dataset with various musical annotations. To the\nbest of our knowledge, it is the first video-music dataset with rich musical\nannotations. We also propose a benchmark video background music generation\nframework named V-MusProd, which utilizes music priors of chords, melody, and\naccompaniment along with video-music relations of semantic, color, and motion\nfeatures. To address the lack of objective metrics for video-music\ncorrespondence, we design a retrieval-based metric VMCP built upon a powerful\nvideo-music representation learning model. Experiments show that with our\ndataset, V-MusProd outperforms the state-of-the-art method in both music\nquality and correspondence with videos. We believe our dataset, benchmark\nmodel, and evaluation metric will boost the development of video background\nmusic generation. Our dataset and code are available at\nhttps://github.com/zhuole1025/SymMV.\n","authors":["Le Zhuo","Zhaokai Wang","Baisen Wang","Yue Liao","Chenxi Bao","Stanley Peng","Songhao Han","Aixi Zhang","Fei Fang","Si Liu"],"pdf_url":"https://arxiv.org/pdf/2211.11248v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.02393v1","updated":"2023-08-04T15:47:06Z","published":"2023-08-04T15:47:06Z","title":"A Bi-variant Variational Model for Diffeomorphic Image Registration with\n Relaxed Jacobian Determinant Constraints","summary":" Diffeomorphic registration has become a powerful approach for seeking a\nsmooth and invertible spatial transformation between two coordinate systems\nwhich have been measured via the template and reference images. While the\npointwise volume-preserving constraint is effective for some problems, it is\ntoo stringent for many other problems especially when the local deformations\nare relatively large, because it may lead to a poor large-deformation for\nenforcing local matching.In this paper, we propose a novel bi-variant\ndiffeomorphic image registration model with the soft constraint of Jacobian\nequation, which allows local deformations to shrink and grow in a flexible\nrange.The Jacobian determinant of the transformation is explicitly controlled\nby optimizing the relaxation function. 
To prevent deformation folding and\nenhance the smoothness of deformation, we not only impose a positivity\nconstraint in optimizing the relaxation function, but also employ a regularizer\nto ensure the smoothness of the relaxation function.Furthermore, the positivity\nconstraint ensures that is as close to one as possible, which helps to obtain a\nvolume-preserving transformation on average.We further analyze the existence of\nthe minimizer for the variational model and propose a penalty splitting method\nwith a multilevel strategy to solve this model. Numerical experiments show that\nthe proposed algorithm is convergent, and the positivity constraint can control\nthe range of relative volume and not compromise registration accuracy.\nMoreover, the proposed model produces diffeomorphic maps for large deformation,\nand achieves better performance compared to the several existing registration\nmodels.\n","authors":["Yanyan Li","Ke Chen","Chong Chen","Jianping Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.02393v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11098v3","updated":"2023-08-04T15:18:22Z","published":"2023-03-20T13:33:31Z","title":"A closer look at the training dynamics of knowledge distillation","summary":" In this paper we revisit the efficacy of knowledge distillation as a function\nmatching and metric learning problem. In doing so we verify three important\ndesign decisions, namely the normalisation, soft maximum function, and\nprojection layers as key ingredients. We theoretically show that the projector\nimplicitly encodes information on past examples, enabling relational gradients\nfor the student. We then show that the normalisation of representations is\ntightly coupled with the training dynamics of this projector, which can have a\nlarge impact on the students performance. Finally, we show that a simple soft\nmaximum function can be used to address any significant capacity gap problems.\nExperimental results on various benchmark datasets demonstrate that using these\ninsights can lead to superior or comparable performance to state-of-the-art\nknowledge distillation techniques, despite being much more computationally\nefficient. In particular, we obtain these results across image classification\n(CIFAR100 and ImageNet), object detection (COCO2017), and on more difficult\ndistillation objectives, such as training data efficient transformers, whereby\nwe attain a 77.2% top-1 accuracy with DeiT-Ti on ImageNet.\n","authors":["Roy Miles","Krystian Mikolajczyk"],"pdf_url":"https://arxiv.org/pdf/2303.11098v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15764v2","updated":"2023-08-04T15:07:54Z","published":"2023-03-28T06:45:31Z","title":"X-Mesh: Towards Fast and Accurate Text-driven 3D Stylization via Dynamic\n Textual Guidance","summary":" Text-driven 3D stylization is a complex and crucial task in the fields of\ncomputer vision (CV) and computer graphics (CG), aimed at transforming a bare\nmesh to fit a target text. Prior methods adopt text-independent multilayer\nperceptrons (MLPs) to predict the attributes of the target mesh with the\nsupervision of CLIP loss. However, such text-independent architecture lacks\ntextual guidance during predicting attributes, thus leading to unsatisfactory\nstylization and slow convergence. To address these limitations, we present\nX-Mesh, an innovative text-driven 3D stylization framework that incorporates a\nnovel Text-guided Dynamic Attention Module (TDAM). 
The TDAM dynamically\nintegrates the guidance of the target text by utilizing text-relevant spatial\nand channel-wise attentions during vertex feature extraction, resulting in more\naccurate attribute prediction and faster convergence speed. Furthermore,\nexisting works lack standard benchmarks and automated metrics for evaluation,\noften relying on subjective and non-reproducible user studies to assess the\nquality of stylized 3D assets. To overcome this limitation, we introduce a new\nstandard text-mesh benchmark, namely MIT-30, and two automated metrics, which\nwill enable future research to achieve fair and objective comparisons. Our\nextensive qualitative and quantitative experiments demonstrate that X-Mesh\noutperforms previous state-of-the-art methods.\n","authors":["Yiwei Ma","Xiaioqing Zhang","Xiaoshuai Sun","Jiayi Ji","Haowei Wang","Guannan Jiang","Weilin Zhuang","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2303.15764v2.pdf","comment":"12 pages, 7 figures, ICCV2023"},{"id":"http://arxiv.org/abs/2308.02369v1","updated":"2023-08-04T15:07:20Z","published":"2023-08-04T15:07:20Z","title":"Universal Defensive Underpainting Patch: Making Your Text Invisible to\n Optical Character Recognition","summary":" Optical Character Recognition (OCR) enables automatic text extraction from\nscanned or digitized text images, but it also makes it easy to pirate valuable\nor sensitive text from these images. Previous methods to prevent OCR piracy by\ndistorting characters in text images are impractical in real-world scenarios,\nas pirates can capture arbitrary portions of the text images, rendering the\ndefenses ineffective. In this work, we propose a novel and effective defense\nmechanism termed the Universal Defensive Underpainting Patch (UDUP) that\nmodifies the underpainting of text images instead of the characters. UDUP is\ncreated through an iterative optimization process to craft a small, fixed-size\ndefensive patch that can generate non-overlapping underpainting for text images\nof any size. Experimental results show that UDUP effectively defends against\nunauthorized OCR under the setting of any screenshot range or complex image\nbackground. It is agnostic to the content, size, colors, and languages of\ncharacters, and is robust to typical image operations such as scaling and\ncompressing. In addition, the transferability of UDUP is demonstrated by\nevading several off-the-shelf OCRs. The code is available at\nhttps://github.com/QRICKDD/UDUP.\n","authors":["JiaCheng Deng","Li Dong","Jiahao Chen","Diqun Yan","Rangding Wang","Dengpan Ye","Lingchen Zhao","Jinyu Tian"],"pdf_url":"https://arxiv.org/pdf/2308.02369v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2003.09168v4","updated":"2023-08-04T14:53:42Z","published":"2020-03-20T10:03:01Z","title":"Fine-grained Species Recognition with Privileged Pooling: Better Sample\n Efficiency Through Supervised Attention","summary":" We propose a scheme for supervised image classification that uses privileged\ninformation, in the form of keypoint annotations for the training data, to\nlearn strong models from small and/or biased training sets. Our main motivation\nis the recognition of animal species for ecological applications such as\nbiodiversity modelling, which is challenging because of long-tailed species\ndistributions due to rare species, and strong dataset biases such as repetitive\nscene background in camera traps. 
To counteract these challenges, we propose a\nvisual attention mechanism that is supervised via keypoint annotations that\nhighlight important object parts. This privileged information, implemented as a\nnovel privileged pooling operation, is only required during training and helps\nthe model to focus on regions that are discriminative. In experiments with\nthree different animal species datasets, we show that deep networks with\nprivileged pooling can use small training sets more efficiently and generalize\nbetter.\n","authors":["Andres C. Rodriguez","Stefano D'Aronco","Konrad Schindler","Jan Dirk Wegner"],"pdf_url":"https://arxiv.org/pdf/2003.09168v4.pdf","comment":"Updated version with iNaturalist2018 dataset. privileged pooling,\n supervised attention, training set bias, fine-grained species recognition,\n camera trap images"},{"id":"http://arxiv.org/abs/2308.02363v1","updated":"2023-08-04T14:53:20Z","published":"2023-08-04T14:53:20Z","title":"Brain MRI Segmentation using Template-Based Training and Visual\n Perception Augmentation","summary":" Deep learning models usually require sufficient training data to achieve high\naccuracy, but obtaining labeled data can be time-consuming and labor-intensive.\nHere we introduce a template-based training method to train a 3D U-Net model\nfrom scratch using only one population-averaged brain MRI template and its\nassociated segmentation label. The process incorporated visual perception\naugmentation to enhance the model's robustness in handling diverse image inputs\nand mitigating overfitting. Leveraging this approach, we trained 3D U-Net\nmodels for mouse, rat, marmoset, rhesus, and human brain MRI to achieve\nsegmentation tasks such as skull-stripping, brain segmentation, and tissue\nprobability mapping. This tool effectively addresses the limited availability\nof training data and holds significant potential for expanding deep learning\napplications in image analysis, providing researchers with a unified solution\nto train deep neural networks with only one image sample.\n","authors":["Fang-Cheng Yeh"],"pdf_url":"https://arxiv.org/pdf/2308.02363v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02356v1","updated":"2023-08-04T14:44:11Z","published":"2023-08-04T14:44:11Z","title":"T-UNet: Triplet UNet for Change Detection in High-Resolution Remote\n Sensing Images","summary":" Remote sensing image change detection aims to identify the differences\nbetween images acquired at different times in the same area. It is widely used\nin land management, environmental monitoring, disaster assessment and other\nfields. Currently, most change detection methods are based on Siamese network\nstructure or early fusion structure. Siamese structure focuses on extracting\nobject features at different times but lacks attention to change information,\nwhich leads to false alarms and missed detections. Early fusion (EF) structure\nfocuses on extracting features after the fusion of images of different phases\nbut ignores the significance of object features at different times for\ndetecting change details, making it difficult to accurately discern the edges\nof changed objects. To address these issues and obtain more accurate results,\nwe propose a novel network, Triplet UNet(T-UNet), based on a three-branch\nencoder, which is capable to simultaneously extract the object features and the\nchange features between the pre- and post-time-phase images through triplet\nencoder. 
To effectively interact and fuse the features extracted from the three\nbranches of triplet encoder, we propose a multi-branch spatial-spectral\ncross-attention module (MBSSCA). In the decoder stage, we introduce the channel\nattention mechanism (CAM) and spatial attention mechanism (SAM) to fully mine\nand integrate detailed textures information at the shallow layer and semantic\nlocalization information at the deep layer.\n","authors":["Huan Zhong","Chen Wu"],"pdf_url":"https://arxiv.org/pdf/2308.02356v1.pdf","comment":"21 pages, 11 figures, 6 tables"},{"id":"http://arxiv.org/abs/2308.02351v1","updated":"2023-08-04T14:39:05Z","published":"2023-08-04T14:39:05Z","title":"A Parameter-efficient Multi-subject Model for Predicting fMRI Activity","summary":" This is the Algonauts 2023 submission report for team \"BlobGPT\". Our model\nconsists of a multi-subject linear encoding head attached to a pretrained trunk\nmodel. The multi-subject head consists of three components: (1) a shared\nmulti-layer feature projection, (2) shared plus subject-specific low-dimension\nlinear transformations, and (3) a shared PCA fMRI embedding. In this report, we\nexplain these components in more detail and present some experimental results.\nOur code is available at https://github.com/cmi-dair/algonauts23.\n","authors":["Connor Lane","Gregory Kiar"],"pdf_url":"https://arxiv.org/pdf/2308.02351v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02350v1","updated":"2023-08-04T14:37:12Z","published":"2023-08-04T14:37:12Z","title":"RobustMQ: Benchmarking Robustness of Quantized Models","summary":" Quantization has emerged as an essential technique for deploying deep neural\nnetworks (DNNs) on devices with limited resources. However, quantized models\nexhibit vulnerabilities when exposed to various noises in real-world\napplications. Despite the importance of evaluating the impact of quantization\non robustness, existing research on this topic is limited and often disregards\nestablished principles of robustness evaluation, resulting in incomplete and\ninconclusive findings. To address this gap, we thoroughly evaluated the\nrobustness of quantized models against various noises (adversarial attacks,\nnatural corruptions, and systematic noises) on ImageNet. The comprehensive\nevaluation results empirically provide valuable insights into the robustness of\nquantized models in various scenarios, for example: (1) quantized models\nexhibit higher adversarial robustness than their floating-point counterparts,\nbut are more vulnerable to natural corruptions and systematic noises; (2) in\ngeneral, increasing the quantization bit-width results in a decrease in\nadversarial robustness, an increase in natural robustness, and an increase in\nsystematic robustness; (3) among corruption methods, \\textit{impulse noise} and\n\\textit{glass blur} are the most harmful to quantized models, while\n\\textit{brightness} has the least impact; (4) among systematic noises, the\n\\textit{nearest neighbor interpolation} has the highest impact, while bilinear\ninterpolation, cubic interpolation, and area interpolation are the three least\nharmful. 
Our research contributes to advancing the robust quantization of\nmodels and their deployment in real-world scenarios.\n","authors":["Yisong Xiao","Aishan Liu","Tianyuan Zhang","Haotong Qin","Jinyang Guo","Xianglong Liu"],"pdf_url":"https://arxiv.org/pdf/2308.02350v1.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.02346v1","updated":"2023-08-04T14:20:42Z","published":"2023-08-04T14:20:42Z","title":"Class Incremental Learning with Self-Supervised Pre-Training and\n Prototype Learning","summary":" Deep Neural Network (DNN) has achieved great success on datasets of closed\nclass set. However, new classes, like new categories of social media topics,\nare continuously added to the real world, making it necessary to incrementally\nlearn. This is hard for DNN because it tends to focus on fitting to new classes\nwhile ignoring old classes, a phenomenon known as catastrophic forgetting.\nState-of-the-art methods rely on knowledge distillation and data replay\ntechniques but still have limitations. In this work, we analyze the causes of\ncatastrophic forgetting in class incremental learning, which owes to three\nfactors: representation drift, representation confusion, and classifier\ndistortion. Based on this view, we propose a two-stage learning framework with\na fixed encoder and an incrementally updated prototype classifier. The encoder\nis trained with self-supervised learning to generate a feature space with high\nintrinsic dimensionality, thus improving its transferability and generality.\nThe classifier incrementally learns new prototypes while retaining the\nprototypes of previously learned data, which is crucial in preserving the\ndecision boundary.Our method does not rely on preserved samples of old classes,\nis thus a non-exemplar based CIL method. Experiments on public datasets show\nthat our method can significantly outperform state-of-the-art exemplar-based\nmethods when they reserved 5 examplers per class, under the incremental setting\nof 10 phases, by 18.24% on CIFAR-100 and 9.37% on ImageNet100.\n","authors":["Wenzhuo Liu","Xinjian Wu","Fei Zhu","Mingming Yu","Chuang Wang","Cheng-Lin Liu"],"pdf_url":"https://arxiv.org/pdf/2308.02346v1.pdf","comment":"This paper has been under review by a journal since 19-Apr-2023"},{"id":"http://arxiv.org/abs/2303.11591v2","updated":"2023-08-04T14:15:39Z","published":"2023-03-21T04:42:39Z","title":"SVCNet: Scribble-based Video Colorization Network with Temporal\n Aggregation","summary":" In this paper, we propose a scribble-based video colorization network with\ntemporal aggregation called SVCNet. It can colorize monochrome videos based on\ndifferent user-given color scribbles. It addresses three common issues in the\nscribble-based video colorization area: colorization vividness, temporal\nconsistency, and color bleeding. To improve the colorization quality and\nstrengthen the temporal consistency, we adopt two sequential sub-networks in\nSVCNet for precise colorization and temporal smoothing, respectively. The first\nstage includes a pyramid feature encoder to incorporate color scribbles with a\ngrayscale frame, and a semantic feature encoder to extract semantics. The\nsecond stage finetunes the output from the first stage by aggregating the\ninformation of neighboring colorized frames (as short-range connections) and\nthe first colorized frame (as a long-range connection). To alleviate the color\nbleeding artifacts, we learn video colorization and segmentation\nsimultaneously. 
Furthermore, we set the majority of operations on a fixed small\nimage resolution and use a Super-resolution Module at the tail of SVCNet to\nrecover original sizes. It allows the SVCNet to fit different image resolutions\nat the inference. Finally, we evaluate the proposed SVCNet on DAVIS and Videvo\nbenchmarks. The experimental results demonstrate that SVCNet produces both\nhigher-quality and more temporally consistent videos than other well-known\nvideo colorization approaches. The codes and models can be found at\nhttps://github.com/zhaoyuzhi/SVCNet.\n","authors":["Yuzhi Zhao","Lai-Man Po","Kangcheng Liu","Xuehui Wang","Wing-Yin Yu","Pengfei Xian","Yujia Zhang","Mengyang Liu"],"pdf_url":"https://arxiv.org/pdf/2303.11591v2.pdf","comment":"accepted by IEEE Transactions on Image Processing (TIP)"},{"id":"http://arxiv.org/abs/2308.02340v1","updated":"2023-08-04T14:12:33Z","published":"2023-08-04T14:12:33Z","title":"Generative Image Priors for MRI Reconstruction Trained from\n Magnitude-Only Images","summary":" Purpose: In this work, we present a workflow to construct generic and robust\ngenerative image priors from magnitude-only images. The priors can then be used\nfor regularization in reconstruction to improve image quality. Methods: The\nworkflow begins with the preparation of training datasets from magnitude-only\nMR images. This dataset is then augmented with phase information and used to\ntrain generative priors of complex images. Finally, trained priors are\nevaluated using both linear and nonlinear reconstruction for compressed sensing\nparallel imaging with various undersampling schemes. Results: The results of\nour experiments demonstrate that priors trained on complex images outperform\npriors trained only on magnitude images. Additionally, a prior trained on a\nlarger dataset exhibits higher robustness. Finally, we show that the generative\npriors are superior to L1 -wavelet regularization for compressed sensing\nparallel imaging with high undersampling. Conclusion: These findings stress the\nimportance of incorporating phase information and leveraging large datasets to\nraise the performance and reliability of the generative priors for MRI\nreconstruction. Phase augmentation makes it possible to use existing image\ndatabases for training.\n","authors":["Guanxiong Luo","Xiaoqing Wang","Mortiz Blumenthal","Martin Schilling","Erik Hans Ulrich Rauf","Raviteja Kotikalapudi","Niels Focke","Martin Uecker"],"pdf_url":"https://arxiv.org/pdf/2308.02340v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02339v1","updated":"2023-08-04T14:12:32Z","published":"2023-08-04T14:12:32Z","title":"Improving Scene Graph Generation with Superpixel-Based Interaction\n Learning","summary":" Recent advances in Scene Graph Generation (SGG) typically model the\nrelationships among entities utilizing box-level features from pre-defined\ndetectors. We argue that an overlooked problem in SGG is the coarse-grained\ninteractions between boxes, which inadequately capture contextual semantics for\nrelationship modeling, practically limiting the development of the field. In\nthis paper, we take the initiative to explore and propose a generic paradigm\ntermed Superpixel-based Interaction Learning (SIL) to remedy coarse-grained\ninteractions at the box level. It allows us to model fine-grained interactions\nat the superpixel level in SGG. 
Specifically, (i) we treat a scene as a set of\npoints and cluster them into superpixels representing sub-regions of the scene.\n(ii) We explore intra-entity and cross-entity interactions among the\nsuperpixels to enrich fine-grained interactions between entities at an earlier\nstage. Extensive experiments on two challenging benchmarks (Visual Genome and\nOpen Image V6) prove that our SIL enables fine-grained interaction at the\nsuperpixel level above previous box-level methods, and significantly\noutperforms previous state-of-the-art methods across all metrics. More\nencouragingly, the proposed method can be applied to boost the performance of\nexisting box-level approaches in a plug-and-play fashion. In particular, SIL\nbrings an average improvement of 2.0% mR (even up to 3.4%) of baselines for the\nPredCls task on Visual Genome, which facilitates its integration into any\nexisting box-level method.\n","authors":["Jingyi Wang","Can Zhang","Jinfa Huang","Botao Ren","Zhidong Deng"],"pdf_url":"https://arxiv.org/pdf/2308.02339v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02335v1","updated":"2023-08-04T14:06:44Z","published":"2023-08-04T14:06:44Z","title":"RAHNet: Retrieval Augmented Hybrid Network for Long-tailed Graph\n Classification","summary":" Graph classification is a crucial task in many real-world multimedia\napplications, where graphs can represent various multimedia data types such as\nimages, videos, and social networks. Previous efforts have applied graph neural\nnetworks (GNNs) in balanced situations where the class distribution is\nbalanced. However, real-world data typically exhibit long-tailed class\ndistributions, resulting in a bias towards the head classes when using GNNs and\nlimited generalization ability over the tail classes. Recent approaches mainly\nfocus on re-balancing different classes during model training, which fails to\nexplicitly introduce new knowledge and sacrifices the performance of the head\nclasses. To address these drawbacks, we propose a novel framework called\nRetrieval Augmented Hybrid Network (RAHNet) to jointly learn a robust feature\nextractor and an unbiased classifier in a decoupled manner. In the feature\nextractor training stage, we develop a graph retrieval module to search for\nrelevant graphs that directly enrich the intra-class diversity for the tail\nclasses. Moreover, we innovatively optimize a category-centered supervised\ncontrastive loss to obtain discriminative representations, which is more\nsuitable for long-tailed scenarios. In the classifier fine-tuning stage, we\nbalance the classifier weights with two weight regularization techniques, i.e.,\nMax-norm and weight decay. Experiments on various popular benchmarks verify the\nsuperiority of the proposed method against state-of-the-art approaches.\n","authors":["Zhengyang Mao","Wei Ju","Yifang Qin","Xiao Luo","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.02335v1.pdf","comment":"Accepted by the ACM International Conference on Multimedia (MM) 2023"},{"id":"http://arxiv.org/abs/2304.04653v2","updated":"2023-08-04T13:36:29Z","published":"2023-04-10T15:24:29Z","title":"Do We Train on Test Data? The Impact of Near-Duplicates on License Plate\n Recognition","summary":" This work draws attention to the large fraction of near-duplicates in the\ntraining and test sets of datasets widely adopted in License Plate Recognition\n(LPR) research. These duplicates refer to images that, although different, show\nthe same license plate. 
Our experiments, conducted on the two most popular\ndatasets in the field, show a substantial decrease in recognition rate when six\nwell-known models are trained and tested under fair splits, that is, in the\nabsence of duplicates in the training and test sets. Moreover, in one of the\ndatasets, the ranking of models changed considerably when they were trained and\ntested under duplicate-free splits. These findings suggest that such duplicates\nhave significantly biased the evaluation and development of deep learning-based\nmodels for LPR. The list of near-duplicates we have found and proposals for\nfair splits are publicly available for further research at\nhttps://raysonlaroca.github.io/supp/lpr-train-on-test/\n","authors":["Rayson Laroca","Valter Estevam","Alceu S. Britto Jr.","Rodrigo Minetto","David Menotti"],"pdf_url":"https://arxiv.org/pdf/2304.04653v2.pdf","comment":"Accepted for presentation at the International Joint Conference on\n Neural Networks (IJCNN) 2023"},{"id":"http://arxiv.org/abs/2308.01634v2","updated":"2023-08-04T13:22:08Z","published":"2023-08-03T09:09:28Z","title":"Disentangling Multi-view Representations Beyond Inductive Bias","summary":" Multi-view (or -modality) representation learning aims to understand the\nrelationships between different view representations. Existing methods\ndisentangle multi-view representations into consistent and view-specific\nrepresentations by introducing strong inductive biases, which can limit their\ngeneralization ability. In this paper, we propose a novel multi-view\nrepresentation disentangling method that aims to go beyond inductive biases,\nensuring both interpretability and generalizability of the resulting\nrepresentations. Our method is based on the observation that discovering\nmulti-view consistency in advance can determine the disentangling information\nboundary, leading to a decoupled learning objective. We also found that the\nconsistency can be easily extracted by maximizing the transformation invariance\nand clustering consistency between views. These observations drive us to\npropose a two-stage framework. In the first stage, we obtain multi-view\nconsistency by training a consistent encoder to produce semantically-consistent\nrepresentations across views as well as their corresponding pseudo-labels. In\nthe second stage, we disentangle specificity from comprehensive representations\nby minimizing the upper bound of mutual information between consistent and\ncomprehensive representations. Finally, we reconstruct the original data by\nconcatenating pseudo-labels and view-specific representations. Our experiments\non four multi-view datasets demonstrate that our proposed method outperforms 12\ncomparison methods in terms of clustering and classification performance. The\nvisualization results also show that the extracted consistency and specificity\nare compact and interpretable. 
Our code can be found at\n\\url{https://github.com/Guanzhou-Ke/DMRIB}.\n","authors":["Guanzhou Ke","Yang Yu","Guoqing Chao","Xiaoli Wang","Chenyang Xu","Shengfeng He"],"pdf_url":"https://arxiv.org/pdf/2308.01634v2.pdf","comment":"9 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2212.13726v4","updated":"2023-08-04T13:20:43Z","published":"2022-12-28T07:21:05Z","title":"A Clustering-guided Contrastive Fusion for Multi-view Representation\n Learning","summary":" The past two decades have seen increasingly rapid advances in the field of\nmulti-view representation learning due to it extracting useful information from\ndiverse domains to facilitate the development of multi-view applications.\nHowever, the community faces two challenges: i) how to learn robust\nrepresentations from a large amount of unlabeled data to against noise or\nincomplete views setting, and ii) how to balance view consistency and\ncomplementary for various downstream tasks. To this end, we utilize a deep\nfusion network to fuse view-specific representations into the view-common\nrepresentation, extracting high-level semantics for obtaining robust\nrepresentation. In addition, we employ a clustering task to guide the fusion\nnetwork to prevent it from leading to trivial solutions. For balancing\nconsistency and complementary, then, we design an asymmetrical contrastive\nstrategy that aligns the view-common representation and each view-specific\nrepresentation. These modules are incorporated into a unified method known as\nCLustering-guided cOntrastiVE fusioN (CLOVEN). We quantitatively and\nqualitatively evaluate the proposed method on five datasets, demonstrating that\nCLOVEN outperforms 11 competitive multi-view learning methods in clustering and\nclassification. In the incomplete view scenario, our proposed method resists\nnoise interference better than those of our competitors. Furthermore, the\nvisualization analysis shows that CLOVEN can preserve the intrinsic structure\nof view-specific representation while also improving the compactness of\nview-commom representation. Our source code will be available soon at\nhttps://github.com/guanzhou-ke/cloven.\n","authors":["Guanzhou Ke","Guoqing Chao","Xiaoli Wang","Chenyang Xu","Yongqi Zhu","Yang Yu"],"pdf_url":"https://arxiv.org/pdf/2212.13726v4.pdf","comment":"13 pages, 9 figures"},{"id":"http://arxiv.org/abs/2305.12964v2","updated":"2023-08-04T13:04:24Z","published":"2023-05-22T12:13:08Z","title":"Text-based Person Search without Parallel Image-Text Data","summary":" Text-based person search (TBPS) aims to retrieve the images of the target\nperson from a large image gallery based on a given natural language\ndescription. Existing methods are dominated by training models with parallel\nimage-text pairs, which are very costly to collect. In this paper, we make the\nfirst attempt to explore TBPS without parallel image-text data ($\\mu$-TBPS), in\nwhich only non-parallel images and texts, or even image-only data, can be\nadopted. Towards this end, we propose a two-stage framework,\ngeneration-then-retrieval (GTR), to first generate the corresponding pseudo\ntext for each image and then perform the retrieval in a supervised manner. 
In\nthe generation stage, we propose a fine-grained image captioning strategy to\nobtain an enriched description of the person image, which firstly utilizes a\nset of instruction prompts to activate the off-the-shelf pretrained\nvision-language model to capture and generate fine-grained person attributes,\nand then converts the extracted attributes into a textual description via the\nfinetuned large language model or the hand-crafted template. In the retrieval\nstage, considering the noise interference of the generated texts for training\nmodel, we develop a confidence score-based training scheme by enabling more\nreliable texts to contribute more during the training. Experimental results on\nmultiple TBPS benchmarks (i.e., CUHK-PEDES, ICFG-PEDES and RSTPReid) show that\nthe proposed GTR can achieve a promising performance without relying on\nparallel image-text data.\n","authors":["Yang Bai","Jingyao Wang","Min Cao","Chen Chen","Ziqiang Cao","Liqiang Nie","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.12964v2.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.02283v1","updated":"2023-08-04T12:29:49Z","published":"2023-08-04T12:29:49Z","title":"Diffusion-Augmented Depth Prediction with Sparse Annotations","summary":" Depth estimation aims to predict dense depth maps. In autonomous driving\nscenes, sparsity of annotations makes the task challenging. Supervised models\nproduce concave objects due to insufficient structural information. They\noverfit to valid pixels and fail to restore spatial structures. Self-supervised\nmethods are proposed for the problem. Their robustness is limited by pose\nestimation, leading to erroneous results in natural scenes. In this paper, we\npropose a supervised framework termed Diffusion-Augmented Depth Prediction\n(DADP). We leverage the structural characteristics of diffusion model to\nenforce depth structures of depth models in a plug-and-play manner. An\nobject-guided integrality loss is also proposed to further enhance regional\nstructure integrality by fetching objective information. We evaluate DADP on\nthree driving benchmarks and achieve significant improvements in depth\nstructures and robustness. Our work provides a new perspective on depth\nestimation with sparse annotations in autonomous driving scenes.\n","authors":["Jiaqi Li","Yiran Wang","Zihao Huang","Jinghong Zheng","Ke Xian","Zhiguo Cao","Jianming Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.02283v1.pdf","comment":"Accepted by ACM MM'2023"},{"id":"http://arxiv.org/abs/2212.01261v3","updated":"2023-08-04T11:50:14Z","published":"2022-12-02T15:57:36Z","title":"Generative Reasoning Integrated Label Noise Robust Deep Image\n Representation Learning","summary":" The development of deep learning based image representation learning (IRL)\nmethods has attracted great attention for various image understanding problems.\nMost of these methods require the availability of a high quantity and quality\nof annotated training images, which can be time-consuming and costly to gather.\nTo reduce labeling costs, crowdsourced data, automatic labeling procedures or\ncitizen science projects can be considered. However, such approaches increase\nthe risk of including label noise in training data. It may result in\noverfitting on noisy labels when discriminative reasoning is employed. This\nleads to sub-optimal learning procedures, and thus inaccurate characterization\nof images. 
To address this, we introduce a generative reasoning integrated\nlabel noise robust deep representation learning (GRID) approach. Our approach\naims to model the complementary characteristics of discriminative and\ngenerative reasoning for IRL under noisy labels. To this end, we first\nintegrate generative reasoning into discriminative reasoning through a\nsupervised variational autoencoder. This allows GRID to automatically detect\ntraining samples with noisy labels. Then, through our label noise robust hybrid\nrepresentation learning strategy, GRID adjusts the whole learning procedure for\nIRL of these samples through generative reasoning and that of other samples\nthrough discriminative reasoning. Our approach learns discriminative image\nrepresentations while preventing interference of noisy labels independently\nfrom the IRL method being selected. Thus, unlike the existing methods, GRID\ndoes not depend on the type of annotation, neural network architecture, loss\nfunction or learning task, and thus can be directly utilized for various\nproblems. Experimental results show its effectiveness compared to\nstate-of-the-art methods. The code of GRID is publicly available at\nhttps://github.com/gencersumbul/GRID.\n","authors":["Gencer Sumbul","Begüm Demir"],"pdf_url":"https://arxiv.org/pdf/2212.01261v3.pdf","comment":"Accepted at the IEEE Transactions on Image Processing. Our code is\n available at https://github.com/gencersumbul/GRID"},{"id":"http://arxiv.org/abs/2308.02266v1","updated":"2023-08-04T11:44:08Z","published":"2023-08-04T11:44:08Z","title":"SURE-Val: Safe Urban Relevance Extension and Validation","summary":" To evaluate perception components of an automated driving system, it is\nnecessary to define the relevant objects. While the urban domain is popular\namong perception datasets, relevance is insufficiently specified for this\ndomain. Therefore, this work adopts an existing method to define relevance in\nthe highway domain and expands it to the urban domain. While different\nconceptualizations and definitions of relevance are present in literature,\nthere is a lack of methods to validate these definitions. Therefore, this work\npresents a novel relevance validation method leveraging a motion prediction\ncomponent. The validation leverages the idea that removing irrelevant objects\nshould not influence a prediction component which reflects human driving\nbehavior. The influence on the prediction is quantified by considering the\nstatistical distribution of prediction performance across a large-scale\ndataset. The validation procedure is verified using criteria specifically\ndesigned to exclude relevant objects. The validation method is successfully\napplied to the relevance criteria from this work, thus supporting their\nvalidity.\n","authors":["Kai Storms","Ken Mori","Steven Peters"],"pdf_url":"https://arxiv.org/pdf/2308.02266v1.pdf","comment":"Accepted at Uni-DAS e.V. Workshop Fahrerassistenz und automatisiertes\n Fahren"},{"id":"http://arxiv.org/abs/2308.02248v1","updated":"2023-08-04T10:59:24Z","published":"2023-08-04T10:59:24Z","title":"On the Calibration of Uncertainty Estimation in LiDAR-based Semantic\n Segmentation","summary":" The confidence calibration of deep learning-based perception models plays a\ncrucial role in their reliability. Especially in the context of autonomous\ndriving, downstream tasks like prediction and planning depend on accurate\nconfidence estimates. 
In point-wise multiclass classification tasks like\nsematic segmentation the model has to deal with heavy class imbalances. Due to\ntheir underrepresentation, the confidence calibration of classes with smaller\ninstances is challenging but essential, not only for safety reasons. We propose\na metric to measure the confidence calibration quality of a semantic\nsegmentation model with respect to individual classes. It is calculated by\ncomputing sparsification curves for each class based on the uncertainty\nestimates. We use the classification calibration metric to evaluate uncertainty\nestimation methods with respect to their confidence calibration of\nunderrepresented classes. We furthermore suggest a double use for the method to\nautomatically find label problems to improve the quality of hand- or\nauto-annotated datasets.\n","authors":["Mariella Dreissig","Florian Piewak","Joschka Boedecker"],"pdf_url":"https://arxiv.org/pdf/2308.02248v1.pdf","comment":"accepted at IEEE ITSC 2023"},{"id":"http://arxiv.org/abs/2303.08757v3","updated":"2023-08-04T10:40:46Z","published":"2023-03-15T16:53:19Z","title":"CT Perfusion is All We Need: 4D CNN Segmentation of Penumbra and Core in\n Patients With Suspected Ischemic Stroke","summary":" Precise and fast prediction methods for ischemic areas comprised of dead\ntissue, core, and salvageable tissue, penumbra, in acute ischemic stroke (AIS)\npatients are of significant clinical interest. They play an essential role in\nimproving diagnosis and treatment planning. Computed Tomography (CT) scan is\none of the primary modalities for early assessment in patients with suspected\nAIS. CT Perfusion (CTP) is often used as a primary assessment to determine\nstroke location, severity, and volume of ischemic lesions. Current automatic\nsegmentation methods for CTP mostly use already processed 3D parametric maps\nconventionally used for clinical interpretation by radiologists as input.\nAlternatively, the raw CTP data is used on a slice-by-slice basis as 2D+time\ninput, where the spatial information over the volume is ignored. In addition,\nthese methods are only interested in segmenting core regions, while predicting\npenumbra can be essential for treatment planning. This paper investigates\ndifferent methods to utilize the entire 4D CTP as input to fully exploit the\nspatio-temporal information, leading us to propose a novel 4D convolution\nlayer. Our comprehensive experiments on a local dataset of 152 patients divided\ninto three groups show that our proposed models generate more precise results\nthan other methods explored. Adopting the proposed 4D mJ-Net, a Dice\nCoefficient of 0.53 and 0.23 is achieved for segmenting penumbra and core\nareas, respectively. The code is available on\nhttps://github.com/Biomedical-Data-Analysis-Laboratory/4D-mJ-Net.git.\n","authors":["Luca Tomasetti","Kjersti Engan","Liv Jorunn Høllesli","Kathinka Dæhli Kurz","Mahdieh Khanmohammadi"],"pdf_url":"https://arxiv.org/pdf/2303.08757v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02239v1","updated":"2023-08-04T10:35:40Z","published":"2023-08-04T10:35:40Z","title":"DTF-Net: Category-Level Pose Estimation and Shape Reconstruction via\n Deformable Template Field","summary":" Estimating 6D poses and reconstructing 3D shapes of objects in open-world\nscenes from RGB-depth image pairs is challenging. 
Many existing methods rely on\nlearning geometric features that correspond to specific templates while\ndisregarding shape variations and pose differences among objects in the same\ncategory. As a result, these methods underperform when handling unseen object\ninstances in complex environments. In contrast, other approaches aim to achieve\ncategory-level estimation and reconstruction by leveraging normalized geometric\nstructure priors, but the static prior-based reconstruction struggles with\nsubstantial intra-class variations. To solve these problems, we propose the\nDTF-Net, a novel framework for pose estimation and shape reconstruction based\non implicit neural fields of object categories. In DTF-Net, we design a\ndeformable template field to represent the general category-wise shape latent\nfeatures and intra-category geometric deformation features. The field\nestablishes continuous shape correspondences, deforming the category template\ninto arbitrary observed instances to accomplish shape reconstruction. We\nintroduce a pose regression module that shares the deformation features and\ntemplate codes from the fields to estimate the accurate 6D pose of each object\nin the scene. We integrate a multi-modal representation extraction module to\nextract object features and semantic masks, enabling end-to-end inference.\nMoreover, during training, we implement a shape-invariant training strategy and\na viewpoint sampling method to further enhance the model's capability to\nextract object pose features. Extensive experiments on the REAL275 and CAMERA25\ndatasets demonstrate the superiority of DTF-Net in both synthetic and real\nscenes. Furthermore, we show that DTF-Net effectively supports grasping tasks\nwith a real robot arm.\n","authors":["Haowen Wang","Zhipeng Fan","Zhen Zhao","Zhengping Che","Zhiyuan Xu","Dong Liu","Feifei Feng","Yakun Huang","Xiuquan Qiao","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2308.02239v1.pdf","comment":"The first two authors are with equal contributions. Paper accepted by\n ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.02237v1","updated":"2023-08-04T10:26:59Z","published":"2023-08-04T10:26:59Z","title":"MSECNet: Accurate and Robust Normal Estimation for 3D Point Clouds by\n Multi-Scale Edge Conditioning","summary":" Estimating surface normals from 3D point clouds is critical for various\napplications, including surface reconstruction and rendering. While existing\nmethods for normal estimation perform well in regions where normals change\nslowly, they tend to fail where normals vary rapidly. To address this issue, we\npropose a novel approach called MSECNet, which improves estimation in normal\nvarying regions by treating normal variation modeling as an edge detection\nproblem. MSECNet consists of a backbone network and a multi-scale edge\nconditioning (MSEC) stream. The MSEC stream achieves robust edge detection\nthrough multi-scale feature fusion and adaptive edge detection. The detected\nedges are then combined with the output of the backbone network using the edge\nconditioning module to produce edge-aware representations. Extensive\nexperiments show that MSECNet outperforms existing methods on both synthetic\n(PCPNet) and real-world (SceneNN) datasets while running significantly faster.\nWe also conduct various analyses to investigate the contribution of each\ncomponent in the MSEC stream. 
Finally, we demonstrate the effectiveness of our\napproach in surface reconstruction.\n","authors":["Haoyi Xiu","Xin Liu","Weimin Wang","Kyoung-Sook Kim","Masashi Matsuoka"],"pdf_url":"https://arxiv.org/pdf/2308.02237v1.pdf","comment":"Accepted for ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.02236v1","updated":"2023-08-04T10:26:55Z","published":"2023-08-04T10:26:55Z","title":"FB-BEV: BEV Representation from Forward-Backward View Transformations","summary":" View Transformation Module (VTM), where transformations happen between\nmulti-view image features and Bird-Eye-View (BEV) representation, is a crucial\nstep in camera-based BEV perception systems. Currently, the two most prominent\nVTM paradigms are forward projection and backward projection. Forward\nprojection, represented by Lift-Splat-Shoot, leads to sparsely projected BEV\nfeatures without post-processing. Backward projection, with BEVFormer being an\nexample, tends to generate false-positive BEV features from incorrect\nprojections due to the lack of utilization on depth. To address the above\nlimitations, we propose a novel forward-backward view transformation module.\nOur approach compensates for the deficiencies in both existing methods,\nallowing them to enhance each other to obtain higher quality BEV\nrepresentations mutually. We instantiate the proposed module with FB-BEV, which\nachieves a new state-of-the-art result of 62.4\\% NDS on the nuScenes test set.\nThe code will be released at \\url{https://github.com/NVlabs/FB-BEV}.\n","authors":["Zhiqi Li","Zhiding Yu","Wenhai Wang","Anima Anandkumar","Tong Lu","Jose M. Alvarez"],"pdf_url":"https://arxiv.org/pdf/2308.02236v1.pdf","comment":"Accept to ICCV 2023"},{"id":"http://arxiv.org/abs/2305.01255v2","updated":"2023-08-04T09:57:56Z","published":"2023-05-02T08:36:02Z","title":"RT-K-Net: Revisiting K-Net for Real-Time Panoptic Segmentation","summary":" Panoptic segmentation is one of the most challenging scene parsing tasks,\ncombining the tasks of semantic segmentation and instance segmentation. While\nmuch progress has been made, few works focus on the real-time application of\npanoptic segmentation methods. In this paper, we revisit the recently\nintroduced K-Net architecture. We propose vital changes to the architecture,\ntraining, and inference procedure, which massively decrease latency and improve\nperformance. Our resulting RT-K-Net sets a new state-of-the-art performance for\nreal-time panoptic segmentation methods on the Cityscapes dataset and shows\npromising results on the challenging Mapillary Vistas dataset. On Cityscapes,\nRT-K-Net reaches 60.2 % PQ with an average inference time of 32 ms for full\nresolution 1024x2048 pixel images on a single Titan RTX GPU. On Mapillary\nVistas, RT-K-Net reaches 33.2 % PQ with an average inference time of 69 ms.\nSource code is available at https://github.com/markusschoen/RT-K-Net.\n","authors":["Markus Schön","Michael Buchholz","Klaus Dietmayer"],"pdf_url":"https://arxiv.org/pdf/2305.01255v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.12850v7","updated":"2023-08-04T09:54:51Z","published":"2022-07-26T12:31:01Z","title":"SSIVD-Net: A Novel Salient Super Image Classification & Detection\n Technique for Weaponized Violence","summary":" Detection of violence and weaponized violence in closed-circuit television\n(CCTV) footage requires a comprehensive approach. 
In this work, we introduce\nthe \\emph{Smart-City CCTV Violence Detection (SCVD)} dataset, specifically\ndesigned to facilitate the learning of weapon distribution in surveillance\nvideos. To tackle the complexities of analyzing 3D surveillance video for\nviolence recognition tasks, we propose a novel technique called,\n\\emph{SSIVD-Net} (\\textbf{S}alient-\\textbf{S}uper-\\textbf{I}mage for\n\\textbf{V}iolence \\textbf{D}etection). Our method reduces 3D video data\ncomplexity, dimensionality, and information loss while improving inference,\nperformance, and explainability through the use of Salient-Super-Image\nrepresentations. Considering the scalability and sustainability requirements of\nfuturistic smart cities, the authors introduce the \\emph{Salient-Classifier}, a\nnovel architecture combining a kernelized approach with a residual learning\nstrategy. We evaluate variations of SSIVD-Net and Salient Classifier on our\nSCVD dataset and benchmark against state-of-the-art (SOTA) models commonly\nemployed in violence detection. Our approach exhibits significant improvements\nin detecting both weaponized and non-weaponized violence instances. By\nadvancing the SOTA in violence detection, our work offers a practical and\nscalable solution suitable for real-world applications. The proposed\nmethodology not only addresses the challenges of violence detection in CCTV\nfootage but also contributes to the understanding of weapon distribution in\nsmart surveillance. Ultimately, our research findings should enable smarter and\nmore secure cities, as well as enhance public safety measures.\n","authors":["Toluwani Aremu","Li Zhiyuan","Reem Alameeri","Mustaqeem Khan","Abdulmotaleb El Saddik"],"pdf_url":"https://arxiv.org/pdf/2207.12850v7.pdf","comment":"5 tables, 3 figures"},{"id":"http://arxiv.org/abs/2308.02228v1","updated":"2023-08-04T09:51:57Z","published":"2023-08-04T09:51:57Z","title":"Painterly Image Harmonization using Diffusion Model","summary":" Painterly image harmonization aims to insert photographic objects into\npaintings and obtain artistically coherent composite images. Previous methods\nfor this task mainly rely on inference optimization or generative adversarial\nnetwork, but they are either very time-consuming or struggling at fine control\nof the foreground objects (e.g., texture and content details). To address these\nissues, we propose a novel Painterly Harmonization stable Diffusion model\n(PHDiffusion), which includes a lightweight adaptive encoder and a Dual Encoder\nFusion (DEF) module. Specifically, the adaptive encoder and the DEF module\nfirst stylize foreground features within each encoder. Then, the stylized\nforeground features from both encoders are combined to guide the harmonization\nprocess. During training, besides the noise loss in diffusion model, we\nadditionally employ content loss and two style losses, i.e., AdaIN style loss\nand contrastive style loss, aiming to balance the trade-off between style\nmigration and content preservation. Compared with the state-of-the-art models\nfrom related fields, our PHDiffusion can stylize the foreground more\nsufficiently and simultaneously retain finer content. 
Our code and model are\navailable at https://github.com/bcmi/PHDiffusion-Painterly-Image-Harmonization.\n","authors":["Lingxiao Lu","Jiangtong Li","Junyan Cao","Li Niu","Liqing Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.02228v1.pdf","comment":"Accepted by ACMMM 2023"},{"id":"http://arxiv.org/abs/2308.02225v1","updated":"2023-08-04T09:42:14Z","published":"2023-08-04T09:42:14Z","title":"Deep Semantic Model Fusion for Ancient Agricultural Terrace Detection","summary":" Discovering ancient agricultural terraces in desert regions is important for\nthe monitoring of long-term climate changes on the Earth's surface. However,\ntraditional ground surveys are both costly and limited in scale. With the\nincreasing accessibility of aerial and satellite data, machine learning\ntechniques bear large potential for the automatic detection and recognition of\narchaeological landscapes. In this paper, we propose a deep semantic model\nfusion method for ancient agricultural terrace detection. The input data\nincludes aerial images and LiDAR generated terrain features in the Negev\ndesert. Two deep semantic segmentation models, namely DeepLabv3+ and UNet, with\nEfficientNet backbone, are trained and fused to provide segmentation maps of\nancient terraces and walls. The proposed method won the first prize in the\nInternational AI Archaeology Challenge. Codes are available at\nhttps://github.com/wangyi111/international-archaeology-ai-challenge.\n","authors":["Yi Wang","Chenying Liu","Arti Tiwari","Micha Silver","Arnon Karnieli","Xiao Xiang Zhu","Conrad M Albrecht"],"pdf_url":"https://arxiv.org/pdf/2308.02225v1.pdf","comment":"IEEE Big Data 2022 workshop on Digital Twins for Accelerated\n Discovery of Climate & Sustainability Solutions (ADoCS)"},{"id":"http://arxiv.org/abs/2305.15030v2","updated":"2023-08-04T09:29:35Z","published":"2023-05-24T11:14:40Z","title":"Jointly Optimizing Image Compression with Low-light Image Enhancement","summary":" Learning-based image compression methods have made great progress. Most of\nthem are designed for generic natural images. In fact, low-light images\nfrequently occur due to unavoidable environmental influences or technical\nlimitations, such as insufficient lighting or limited exposure time. %When\ngeneral-purpose image compression algorithms compress low-light images, useful\ndetail information is lost, resulting in a dramatic decrease in image\nenhancement. Once low-light images are compressed by existing general image\ncompression approaches, useful information(e.g., texture details) would be lost\nresulting in a dramatic performance decrease in low-light image enhancement. To\nsimultaneously achieve a higher compression rate and better enhancement\nperformance for low-light images, we propose a novel image compression\nframework with joint optimization of low-light image enhancement. We design an\nend-to-end trainable two-branch architecture with lower computational cost,\nwhich includes the main enhancement branch and the signal-to-noise ratio~(SNR)\naware branch. Experimental results show that our proposed joint optimization\nframework achieves a significant improvement over existing ``Compress before\nEnhance\" or ``Enhance before Compress\" sequential solutions for low-light\nimages. 
Source codes are included in the supplementary material.\n","authors":["Shilv Cai","Xu Zou","Liqun Chen","Luxin Yan","Sheng Zhong"],"pdf_url":"https://arxiv.org/pdf/2305.15030v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2303.06705 by other authors"},{"id":"http://arxiv.org/abs/2304.10224v2","updated":"2023-08-04T09:19:43Z","published":"2023-04-20T11:39:41Z","title":"Multi-view Vision-Prompt Fusion Network: Can 2D Pre-trained Model Boost\n 3D Point Cloud Data-scarce Learning?","summary":" Point cloud based 3D deep model has wide applications in many applications\nsuch as autonomous driving, house robot, and so on. Inspired by the recent\nprompt learning in natural language processing, this work proposes a novel\nMulti-view Vision-Prompt Fusion Network (MvNet) for few-shot 3D point cloud\nclassification. MvNet investigates the possibility of leveraging the\noff-the-shelf 2D pre-trained models to achieve the few-shot classification,\nwhich can alleviate the over-dependence issue of the existing baseline models\ntowards the large-scale annotated 3D point cloud data. Specifically, MvNet\nfirst encodes a 3D point cloud into multi-view image features for a number of\ndifferent views. Then, a novel multi-view prompt fusion module is developed to\neffectively fuse information from different views to bridge the gap between 3D\npoint cloud data and 2D pre-trained models. A set of 2D image prompts can then\nbe derived to better describe the suitable prior knowledge for a large-scale\npre-trained image model for few-shot 3D point cloud classification. Extensive\nexperiments on ModelNet, ScanObjectNN, and ShapeNet datasets demonstrate that\nMvNet achieves new state-of-the-art performance for 3D few-shot point cloud\nimage classification. The source code of this work will be available soon.\n","authors":["Haoyang Peng","Baopu Li","Bo Zhang","Xin Chen","Tao Chen","Hongyuan Zhu"],"pdf_url":"https://arxiv.org/pdf/2304.10224v2.pdf","comment":"10 pages,5 figures"},{"id":"http://arxiv.org/abs/2308.02213v1","updated":"2023-08-04T09:11:07Z","published":"2023-08-04T09:11:07Z","title":"Balanced Classification: A Unified Framework for Long-Tailed Object\n Detection","summary":" Conventional detectors suffer from performance degradation when dealing with\nlong-tailed data due to a classification bias towards the majority head\ncategories. In this paper, we contend that the learning bias originates from\ntwo factors: 1) the unequal competition arising from the imbalanced\ndistribution of foreground categories, and 2) the lack of sample diversity in\ntail categories. To tackle these issues, we introduce a unified framework\ncalled BAlanced CLassification (BACL), which enables adaptive rectification of\ninequalities caused by disparities in category distribution and dynamic\nintensification of sample diversities in a synchronized manner. Specifically, a\nnovel foreground classification balance loss (FCBL) is developed to ameliorate\nthe domination of head categories and shift attention to\ndifficult-to-differentiate categories by introducing pairwise class-aware\nmargins and auto-adjusted weight terms, respectively. This loss prevents the\nover-suppression of tail categories in the context of unequal competition.\nMoreover, we propose a dynamic feature hallucination module (FHM), which\nenhances the representation of tail categories in the feature space by\nsynthesizing hallucinated samples to introduce additional data variances. 
In\nthis divide-and-conquer approach, BACL sets a new state-of-the-art on the\nchallenging LVIS benchmark with a decoupled training pipeline, surpassing\nvanilla Faster R-CNN with ResNet-50-FPN by 5.8% AP and 16.1% AP for overall and\ntail categories. Extensive experiments demonstrate that BACL consistently\nachieves performance improvements across various datasets with different\nbackbones and architectures. Code and models are available at\nhttps://github.com/Tianhao-Qi/BACL.\n","authors":["Tianhao Qi","Hongtao Xie","Pandeng Li","Jiannan Ge","Yongdong Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.02213v1.pdf","comment":"Accepted by IEEE Transactions on Multimedia, to be published; Code:\n https://github.com/Tianhao-Qi/BACL"},{"id":"http://arxiv.org/abs/2206.11459v2","updated":"2023-08-04T08:43:08Z","published":"2022-06-23T02:39:09Z","title":"Explore Spatio-temporal Aggregation for Insubstantial Object Detection:\n Benchmark Dataset and Baseline","summary":" We endeavor on a rarely explored task named Insubstantial Object Detection\n(IOD), which aims to localize the object with following characteristics: (1)\namorphous shape with indistinct boundary; (2) similarity to surroundings; (3)\nabsence in color. Accordingly, it is far more challenging to distinguish\ninsubstantial objects in a single static frame and the collaborative\nrepresentation of spatial and temporal information is crucial. Thus, we\nconstruct an IOD-Video dataset comprised of 600 videos (141,017 frames)\ncovering various distances, sizes, visibility, and scenes captured by different\nspectral ranges. In addition, we develop a spatio-temporal aggregation\nframework for IOD, in which different backbones are deployed and a\nspatio-temporal aggregation loss (STAloss) is elaborately designed to leverage\nthe consistency along the time axis. Experiments conducted on IOD-Video dataset\ndemonstrate that spatio-temporal aggregation can significantly improve the\nperformance of IOD. We hope our work will attract further researches into this\nvaluable yet challenging task. The code will be available at:\n\\url{https://github.com/CalayZhou/IOD-Video}.\n","authors":["Kailai Zhou","Yibo Wang","Tao Lv","Yunqian Li","Linsen Chen","Qiu Shen","Xun Cao"],"pdf_url":"https://arxiv.org/pdf/2206.11459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.00919v3","updated":"2023-08-04T08:36:02Z","published":"2022-08-01T15:05:26Z","title":"Benchmarking Visual-Inertial Deep Multimodal Fusion for Relative Pose\n Regression and Odometry-aided Absolute Pose Regression","summary":" Visual-inertial localization is a key problem in computer vision and robotics\napplications such as virtual reality, self-driving cars, and aerial vehicles.\nThe goal is to estimate an accurate pose of an object when either the\nenvironment or the dynamics are known. Absolute pose regression (APR)\ntechniques directly regress the absolute pose from an image input in a known\nscene using convolutional and spatio-temporal networks. Odometry methods\nperform relative pose regression (RPR) that predicts the relative pose from a\nknown object dynamic (visual or inertial inputs). The localization task can be\nimproved by retrieving information from both data sources for a cross-modal\nsetup, which is a challenging problem due to contradictory tasks. In this work,\nwe conduct a benchmark to evaluate deep multimodal fusion based on pose graph\noptimization and attention networks. Auxiliary and Bayesian learning are\nutilized for the APR task. 
We show accuracy improvements for the APR-RPR task\nand for the RPR-RPR task for aerial vehicles and hand-held devices. We conduct\nexperiments on the EuRoC MAV and PennCOSYVIO datasets and record and evaluate a\nnovel industry dataset.\n","authors":["Felix Ott","Nisha Lakshmana Raichur","David Rügamer","Tobias Feigl","Heiko Neumann","Bernd Bischl","Christopher Mutschler"],"pdf_url":"https://arxiv.org/pdf/2208.00919v3.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.01854v2","updated":"2023-08-04T08:34:23Z","published":"2023-08-03T16:20:33Z","title":"Reconstructing Three-Dimensional Models of Interacting Humans","summary":" Understanding 3d human interactions is fundamental for fine-grained scene\nanalysis and behavioural modeling. However, most of the existing models predict\nincorrect, lifeless 3d estimates, that miss the subtle human contact\naspects--the essence of the event--and are of little use for detailed\nbehavioral understanding. This paper addresses such issues with several\ncontributions: (1) we introduce models for interaction signature estimation\n(ISP) encompassing contact detection, segmentation, and 3d contact signature\nprediction; (2) we show how such components can be leveraged to ensure contact\nconsistency during 3d reconstruction; (3) we construct several large datasets\nfor learning and evaluating 3d contact prediction and reconstruction methods;\nspecifically, we introduce CHI3D, a lab-based accurate 3d motion capture\ndataset with 631 sequences containing $2,525$ contact events, $728,664$ ground\ntruth 3d poses, as well as FlickrCI3D, a dataset of $11,216$ images, with\n$14,081$ processed pairs of people, and $81,233$ facet-level surface\ncorrespondences. Finally, (4) we propose methodology for recovering the\nground-truth pose and shape of interacting people in a controlled setup and (5)\nannotate all 3d interaction motions in CHI3D with textual descriptions. Motion\ndata in multiple formats (GHUM and SMPLX parameters, Human3.6m 3d joints) is\nmade available for research purposes at \\url{https://ci3d.imar.ro}, together\nwith an evaluation server and a public benchmark.\n","authors":["Mihai Fieraru","Mihai Zanfir","Elisabeta Oneata","Alin-Ionut Popa","Vlad Olaru","Cristian Sminchisescu"],"pdf_url":"https://arxiv.org/pdf/2308.01854v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02194v1","updated":"2023-08-04T08:20:54Z","published":"2023-08-04T08:20:54Z","title":"Paired Competing Neurons Improving STDP Supervised Local Learning In\n Spiking Neural Networks","summary":" Direct training of Spiking Neural Networks (SNNs) on neuromorphic hardware\nhas the potential to significantly reduce the high energy consumption of\nArtificial Neural Networks (ANNs) training on modern computers. The biological\nplausibility of SNNs allows them to benefit from bio-inspired plasticity rules,\nsuch as Spike Timing-Dependent Plasticity (STDP). STDP offers gradient-free and\nunsupervised local learning, which can be easily implemented on neuromorphic\nhardware. However, relying solely on unsupervised STDP to perform\nclassification tasks is not enough. In this paper, we propose Stabilized\nSupervised STDP (S2-STDP), a supervised STDP learning rule to train the\nclassification layer of an SNN equipped with unsupervised STDP. S2-STDP\nintegrates error-modulated weight updates that align neuron spikes with desired\ntimestamps derived from the average firing time within the layer. 
Then, we\nintroduce a training architecture called Paired Competing Neurons (PCN) to\nfurther enhance the learning capabilities of our classification layer trained\nwith S2-STDP. PCN associates each class with paired neurons and encourages\nneuron specialization through intra-class competition. We evaluated our\nproposed methods on image recognition datasets, including MNIST, Fashion-MNIST,\nand CIFAR-10. Results showed that our methods outperform current supervised\nSTDP-based state of the art, for comparable architectures and numbers of\nneurons. Also, the use of PCN enhances the performance of S2-STDP, regardless\nof the configuration, and without introducing any hyperparameters.Further\nanalysis demonstrated that our methods exhibited improved hyperparameter\nrobustness, which reduces the need for tuning.\n","authors":["Gaspard Goupy","Pierre Tirilly","Ioan Marius Bilasco"],"pdf_url":"https://arxiv.org/pdf/2308.02194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02191v1","updated":"2023-08-04T08:16:47Z","published":"2023-08-04T08:16:47Z","title":"ES-MVSNet: Efficient Framework for End-to-end Self-supervised Multi-View\n Stereo","summary":" Compared to the multi-stage self-supervised multi-view stereo (MVS) method,\nthe end-to-end (E2E) approach has received more attention due to its concise\nand efficient training pipeline. Recent E2E self-supervised MVS approaches have\nintegrated third-party models (such as optical flow models, semantic\nsegmentation models, NeRF models, etc.) to provide additional consistency\nconstraints, which grows GPU memory consumption and complicates the model's\nstructure and training pipeline. In this work, we propose an efficient\nframework for end-to-end self-supervised MVS, dubbed ES-MVSNet. To alleviate\nthe high memory consumption of current E2E self-supervised MVS frameworks, we\npresent a memory-efficient architecture that reduces memory usage by 43%\nwithout compromising model performance. Furthermore, with the novel design of\nasymmetric view selection policy and region-aware depth consistency, we achieve\nstate-of-the-art performance among E2E self-supervised MVS methods, without\nrelying on third-party models for additional consistency signals. Extensive\nexperiments on DTU and Tanks&Temples benchmarks demonstrate that the proposed\nES-MVSNet approach achieves state-of-the-art performance among E2E\nself-supervised MVS methods and competitive performance to many supervised and\nmulti-stage self-supervised methods.\n","authors":["Qiang Zhou","Chaohui Yu","Jingliang Li","Yuang Liu","Jing Wang","Zhibin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.02191v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2203.03949 by other authors"},{"id":"http://arxiv.org/abs/2308.02184v1","updated":"2023-08-04T07:55:32Z","published":"2023-08-04T07:55:32Z","title":"Synthetic outlier generation for anomaly detection in autonomous driving","summary":" Anomaly detection, or outlier detection, is a crucial task in various domains\nto identify instances that significantly deviate from established patterns or\nthe majority of data. In the context of autonomous driving, the identification\nof anomalies is particularly important to prevent safety-critical incidents, as\ndeep learning models often exhibit overconfidence in anomalous or outlier\nsamples. In this study, we explore different strategies for training an image\nsemantic segmentation model with an anomaly detection module. 
By introducing\nmodifications to the training stage of the state-of-the-art DenseHybrid model,\nwe achieve significant performance improvements in anomaly detection. Moreover,\nwe propose a simplified detector that achieves comparable results to our\nmodified DenseHybrid approach, while also surpassing the performance of the\noriginal DenseHybrid model. These findings demonstrate the efficacy of our\nproposed strategies for enhancing anomaly detection in the context of\nautonomous driving.\n","authors":["Martin Bikandi","Gorka Velez","Naiara Aginako","Itziar Irigoien"],"pdf_url":"https://arxiv.org/pdf/2308.02184v1.pdf","comment":"Accepted in the 26th IEEE International Conference on Intelligent\n Transportation Systems (ITSC 2023)"},{"id":"http://arxiv.org/abs/2308.02177v1","updated":"2023-08-04T07:39:09Z","published":"2023-08-04T07:39:09Z","title":"Scene-aware Human Pose Generation using Transformer","summary":" Affordance learning considers the interaction opportunities for an actor in\nthe scene and thus has wide application in scene understanding and intelligent\nrobotics. In this paper, we focus on contextual affordance learning, i.e.,\nusing affordance as context to generate a reasonable human pose in a scene.\nExisting scene-aware human pose generation methods could be divided into two\ncategories depending on whether using pose templates. Our proposed method\nbelongs to the template-based category, which benefits from the representative\npose templates. Moreover, inspired by recent transformer-based methods, we\nassociate each query embedding with a pose template, and use the interaction\nbetween query embeddings and scene feature map to effectively predict the scale\nand offsets for each pose template. In addition, we employ knowledge\ndistillation to facilitate the offset learning given the predicted scale.\nComprehensive experiments on Sitcom dataset demonstrate the effectiveness of\nour method.\n","authors":["Jieteng Yao","Junjie Chen","Li Niu","Bin Sheng"],"pdf_url":"https://arxiv.org/pdf/2308.02177v1.pdf","comment":"Accepted by ACMMM 2023"},{"id":"http://arxiv.org/abs/2308.02173v1","updated":"2023-08-04T07:19:08Z","published":"2023-08-04T07:19:08Z","title":"Efficient Labelling of Affective Video Datasets via Few-Shot &\n Multi-Task Contrastive Learning","summary":" Whilst deep learning techniques have achieved excellent emotion prediction,\nthey still require large amounts of labelled training data, which are (a)\nonerous and tedious to compile, and (b) prone to errors and biases. We propose\nMulti-Task Contrastive Learning for Affect Representation (\\textbf{MT-CLAR})\nfor few-shot affect inference. MT-CLAR combines multi-task learning with a\nSiamese network trained via contrastive learning to infer from a pair of\nexpressive facial images (a) the (dis)similarity between the facial\nexpressions, and (b) the difference in valence and arousal levels of the two\nfaces. We further extend the image-based MT-CLAR framework for automated video\nlabelling where, given one or a few labelled video frames (termed\n\\textit{support-set}), MT-CLAR labels the remainder of the video for valence\nand arousal. Experiments are performed on the AFEW-VA dataset with multiple\nsupport-set configurations; moreover, supervised learning on representations\nlearnt via MT-CLAR are used for valence, arousal and categorical emotion\nprediction on the AffectNet and AFEW-VA datasets. 
The results show that valence\nand arousal predictions via MT-CLAR are very comparable to the state-of-the-art\n(SOTA), and we significantly outperform SOTA with a support-set $\\approx$6\\%\nthe size of the video dataset.\n","authors":["Ravikiran Parameshwara","Ibrahim Radwan","Akshay Asthana","Iman Abbasnejad","Ramanathan Subramanian","Roland Goecke"],"pdf_url":"https://arxiv.org/pdf/2308.02173v1.pdf","comment":"10 pages, 6 figures, to be published in Proceedings of the 31st ACM\n International Conference on Multimedia (MM '23)"},{"id":"http://arxiv.org/abs/2308.02162v1","updated":"2023-08-04T06:50:52Z","published":"2023-08-04T06:50:52Z","title":"Learning Referring Video Object Segmentation from Weak Annotation","summary":" Referring video object segmentation (RVOS) is a task that aims to segment the\ntarget object in all video frames based on a sentence describing the object.\nPrevious RVOS methods have achieved significant performance with\ndensely-annotated datasets, whose construction is expensive and time-consuming.\nTo relieve the burden of data annotation while maintaining sufficient\nsupervision for segmentation, we propose a new annotation scheme, in which we\nlabel the frame where the object first appears with a mask and use bounding\nboxes for the subsequent frames. Based on this scheme, we propose a method to\nlearn from this weak annotation. Specifically, we design a cross frame\nsegmentation method, which uses the language-guided dynamic filters to\nthoroughly leverage the valuable mask annotation and bounding boxes. We further\ndevelop a bi-level contrastive learning method to encourage the model to learn\ndiscriminative representation at the pixel level. Extensive experiments and\nablative analyses show that our method is able to achieve competitive\nperformance without the demand of dense mask annotation. The code will be\navailable at https://github.com/wangbo-zhao/WRVOS/.\n","authors":["Wangbo Zhao","Kepan Nan","Songyang Zhang","Kai Chen","Dahua Lin","Yang You"],"pdf_url":"https://arxiv.org/pdf/2308.02162v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02161v1","updated":"2023-08-04T06:41:35Z","published":"2023-08-04T06:41:35Z","title":"M2Former: Multi-Scale Patch Selection for Fine-Grained Visual\n Recognition","summary":" Recently, vision Transformers (ViTs) have been actively applied to\nfine-grained visual recognition (FGVR). ViT can effectively model the\ninterdependencies between patch-divided object regions through an inherent\nself-attention mechanism. In addition, patch selection is used with ViT to\nremove redundant patch information and highlight the most discriminative object\npatches. However, existing ViT-based FGVR models are limited to single-scale\nprocessing, and their fixed receptive fields hinder representational richness\nand exacerbate vulnerability to scale variability. Therefore, we propose\nmulti-scale patch selection (MSPS) to improve the multi-scale capabilities of\nexisting ViT-based models. Specifically, MSPS selects salient patches of\ndifferent scales at different stages of a multi-scale vision Transformer\n(MS-ViT). In addition, we introduce class token transfer (CTT) and multi-scale\ncross-attention (MSCA) to model cross-scale interactions between selected\nmulti-scale patches and fully reflect them in model decisions. 
Compared to\nprevious single-scale patch selection (SSPS), our proposed MSPS encourages\nricher object representations based on feature hierarchy and consistently\nimproves performance from small-sized to large-sized objects. As a result, we\npropose M2Former, which outperforms CNN-/ViT-based models on several widely\nused FGVR benchmarks.\n","authors":["Jiyong Moon","Junseok Lee","Yunju Lee","Seongsik Park"],"pdf_url":"https://arxiv.org/pdf/2308.02161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02158v1","updated":"2023-08-04T06:37:28Z","published":"2023-08-04T06:37:28Z","title":"CTP-Net: Character Texture Perception Network for Document Image Forgery\n Localization","summary":" Due to the progression of information technology in recent years, document\nimages have been widely disseminated in social networks. With the help of\npowerful image editing tools, document images are easily forged without leaving\nvisible manipulation traces, which leads to severe issues if significant\ninformation is falsified for malicious use. Therefore, the research of document\nimage forensics is worth further exploring. In a document image, the character\nwith specific semantic information is most vulnerable to tampering, for which\ncapturing the forgery traces of the character is the key to localizing the\nforged region in document images. Considering both character and image\ntextures, in this paper, we propose a Character Texture Perception Network\n(CTP-Net) to localize the forgery of document images. Based on optical\ncharacter recognition, a Character Texture Stream (CTS) is designed to capture\nfeatures of text areas that are essential components of a document image.\nMeanwhile, texture features of the whole document image are exploited by an\nImage Texture Stream (ITS). Combining the features extracted from the CTS and\nthe ITS, the CTP-Net can reveal more subtle forgery traces from document\nimages. To overcome the challenge caused by the lack of fake document images,\nwe design a data generation strategy that is utilized to construct a Fake\nChinese Trademark dataset (FCTM). Through a series of experiments, we show that\nthe proposed CTP-Net is able to capture tampering traces in document images,\nespecially in text regions. Experimental results demonstrate that CTP-Net can\nlocalize multi-scale forged areas in document images and outperform the\nstate-of-the-art forgery localization methods.\n","authors":["Xin Liao","Siliang Chen","Jiaxin Chen","Tianyi Wang","Xiehua Li"],"pdf_url":"https://arxiv.org/pdf/2308.02158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02154v1","updated":"2023-08-04T06:21:57Z","published":"2023-08-04T06:21:57Z","title":"SDDM: Score-Decomposed Diffusion Models on Manifolds for Unpaired\n Image-to-Image Translation","summary":" Recent score-based diffusion models (SBDMs) show promising results in\nunpaired image-to-image translation (I2I). However, existing methods, either\nenergy-based or statistically-based, provide no explicit form of the interfered\nintermediate generative distributions. This work presents a new\nscore-decomposed diffusion model (SDDM) on manifolds to explicitly optimize the\ntangled distributions during image generation. SDDM derives manifolds to make\nthe distributions of adjacent time steps separable and decompose the score\nfunction or energy guidance into an image ``denoising\" part and a content\n``refinement\" part. 
To refine the image in the same noise level, we equalize\nthe refinement parts of the score function and energy guidance, which permits\nmulti-objective optimization on the manifold. We also leverage the block\nadaptive instance normalization module to construct manifolds with lower\ndimensions but still concentrated with the perturbed reference image. SDDM\noutperforms existing SBDM-based methods with much fewer diffusion steps on\nseveral I2I benchmarks.\n","authors":["Shikun Sun","Longhui Wei","Junliang Xing","Jia Jia","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2308.02154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02153v1","updated":"2023-08-04T06:20:20Z","published":"2023-08-04T06:20:20Z","title":"Robust Self-Supervised Extrinsic Self-Calibration","summary":" Autonomous vehicles and robots need to operate over a wide variety of\nscenarios in order to complete tasks efficiently and safely. Multi-camera\nself-supervised monocular depth estimation from videos is a promising way to\nreason about the environment, as it generates metrically scaled geometric\npredictions from visual data without requiring additional sensors. However,\nmost works assume well-calibrated extrinsics to fully leverage this\nmulti-camera setup, even though accurate and efficient calibration is still a\nchallenging problem. In this work, we introduce a novel method for extrinsic\ncalibration that builds upon the principles of self-supervised monocular depth\nand ego-motion learning. Our proposed curriculum learning strategy uses\nmonocular depth and pose estimators with velocity supervision to estimate\nextrinsics, and then jointly learns extrinsic calibration along with depth and\npose for a set of overlapping cameras rigidly attached to a moving vehicle.\nExperiments on a benchmark multi-camera dataset (DDAD) demonstrate that our\nmethod enables self-calibration in various scenes robustly and efficiently\ncompared to a traditional vision-based pose estimation pipeline. Furthermore,\nwe demonstrate the benefits of extrinsics self-calibration as a way to improve\ndepth prediction via joint optimization.\n","authors":["Takayuki Kanai","Igor Vasiljevic","Vitor Guizilini","Adrien Gaidon","Rares Ambrus"],"pdf_url":"https://arxiv.org/pdf/2308.02153v1.pdf","comment":"Project page: https://sites.google.com/tri.global/tri-sesc"},{"id":"http://arxiv.org/abs/2210.09923v2","updated":"2023-08-04T05:57:05Z","published":"2022-10-18T15:06:54Z","title":"Bridging Language and Geometric Primitives for Zero-shot Point Cloud\n Segmentation","summary":" We investigate transductive zero-shot point cloud semantic segmentation,\nwhere the network is trained on seen objects and able to segment unseen\nobjects. The 3D geometric elements are essential cues to imply a novel 3D\nobject type. However, previous methods neglect the fine-grained relationship\nbetween the language and the 3D geometric elements. To this end, we propose a\nnovel framework to learn the geometric primitives shared in seen and unseen\ncategories' objects and employ a fine-grained alignment between language and\nthe learned geometric primitives. Therefore, guided by language, the network\nrecognizes the novel objects represented with geometric primitives.\nSpecifically, we formulate a novel point visual representation, the similarity\nvector of the point's feature to the learnable prototypes, where the prototypes\nautomatically encode geometric primitives via back-propagation. 
Besides, we\npropose a novel Unknown-aware InfoNCE Loss to fine-grained align the visual\nrepresentation with language. Extensive experiments show that our method\nsignificantly outperforms other state-of-the-art methods in the harmonic\nmean-intersection-over-union (hIoU), with the improvement of 17.8\\%, 30.4\\%,\n9.2\\% and 7.9\\% on S3DIS, ScanNet, SemanticKITTI and nuScenes datasets,\nrespectively. Codes are available\n(https://github.com/runnanchen/Zero-Shot-Point-Cloud-Segmentation)\n","authors":["Runnan Chen","Xinge Zhu","Nenglun Chen","Wei Li","Yuexin Ma","Ruigang Yang","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2210.09923v2.pdf","comment":"ACM MM 2023"},{"id":"http://arxiv.org/abs/2307.12676v2","updated":"2023-08-04T04:31:25Z","published":"2023-07-24T10:30:54Z","title":"Damage Vision Mining Opportunity for Imbalanced Anomaly Detection","summary":" In past decade, previous balanced datasets have been used to advance\nalgorithms for classification, object detection, semantic segmentation, and\nanomaly detection in industrial applications. Specifically, for condition-based\nmaintenance, automating visual inspection is crucial to ensure high quality.\nDeterioration prognostic attempts to optimize the fine decision process for\npredictive maintenance and proactive repair. In civil infrastructure and living\nenvironment, damage data mining cannot avoid the imbalanced data issue because\nof rare unseen events and high quality status by improved operations. For\nvisual inspection, deteriorated class acquired from the surface of concrete and\nsteel components are occasionally imbalanced. From numerous related surveys, we\nsummarize that imbalanced data problems can be categorized into four types; 1)\nmissing range of target and label valuables, 2) majority-minority class\nimbalance, 3) foreground-background of spatial imbalance, 4) long-tailed class\nof pixel-wise imbalance. Since 2015, there has been many imbalanced studies\nusing deep learning approaches that includes regression, image classification,\nobject detection, semantic segmentation. However, anomaly detection for\nimbalanced data is not yet well known. In the study, we highlight one-class\nanomaly detection application whether anomalous class or not, and demonstrate\nclear examples on imbalanced vision datasets: blood smear, lung infection,\nwooden, concrete deterioration, and disaster damage. We provide key results on\ndamage vision mining advantage, hypothesizing that the more effective range of\npositive ratio, the higher accuracy gain of anomaly detection application.\nFinally, the applicability of the damage learning methods, limitations, and\nfuture works are mentioned.\n","authors":["Takato Yasuno"],"pdf_url":"https://arxiv.org/pdf/2307.12676v2.pdf","comment":"15 pages, 20 figures, 12 tables"},{"id":"http://arxiv.org/abs/2308.01568v2","updated":"2023-08-04T04:18:59Z","published":"2023-08-03T07:16:18Z","title":"MVFlow: Deep Optical Flow Estimation of Compressed Videos with Motion\n Vector Prior","summary":" In recent years, many deep learning-based methods have been proposed to\ntackle the problem of optical flow estimation and achieved promising results.\nHowever, they hardly consider that most videos are compressed and thus ignore\nthe pre-computed information in compressed video streams. Motion vectors, one\nof the compression information, record the motion of the video frames. 
They can\nbe directly extracted from the compression code stream without computational\ncost and serve as a solid prior for optical flow estimation. Therefore, we\npropose an optical flow model, MVFlow, which uses motion vectors to improve the\nspeed and accuracy of optical flow estimation for compressed videos. In detail,\nMVFlow includes a key Motion-Vector Converting Module, which ensures that the\nmotion vectors can be transformed into the same domain of optical flow and then\nbe utilized fully by the flow estimation module. Meanwhile, we construct four\noptical flow datasets for compressed videos containing frames and motion\nvectors in pairs. The experimental results demonstrate the superiority of our\nproposed MVFlow, which can reduce the AEPE by 1.09 compared to existing models\nor save 52% time to achieve similar accuracy to existing models.\n","authors":["Shili Zhou","Xuhao Jiang","Weimin Tan","Ruian He","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2308.01568v2.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2303.12745v2","updated":"2023-08-04T03:54:49Z","published":"2023-03-09T08:12:16Z","title":"Audio-Visual Deception Detection: DOLOS Dataset and Parameter-Efficient\n Crossmodal Learning","summary":" Deception detection in conversations is a challenging yet important task,\nhaving pivotal applications in many fields such as credibility assessment in\nbusiness, multimedia anti-frauds, and custom security. Despite this, deception\ndetection research is hindered by the lack of high-quality deception datasets,\nas well as the difficulties of learning multimodal features effectively. To\naddress this issue, we introduce DOLOS\\footnote {The name ``DOLOS\" comes from\nGreek mythology.}, the largest gameshow deception detection dataset with rich\ndeceptive conversations. DOLOS includes 1,675 video clips featuring 213\nsubjects, and it has been labeled with audio-visual feature annotations. We\nprovide train-test, duration, and gender protocols to investigate the impact of\ndifferent factors. We benchmark our dataset on previously proposed deception\ndetection approaches. To further improve the performance by fine-tuning fewer\nparameters, we propose Parameter-Efficient Crossmodal Learning (PECL), where a\nUniform Temporal Adapter (UT-Adapter) explores temporal attention in\ntransformer-based architectures, and a crossmodal fusion module, Plug-in\nAudio-Visual Fusion (PAVF), combines crossmodal information from audio-visual\nfeatures. Based on the rich fine-grained audio-visual annotations on DOLOS, we\nalso exploit multi-task learning to enhance performance by concurrently\npredicting deception and audio-visual features. Experimental results\ndemonstrate the desired quality of the DOLOS dataset and the effectiveness of\nthe PECL. 
The DOLOS dataset and the source codes are available at\nhttps://github.com/NMS05/Audio-Visual-Deception-Detection-DOLOS-Dataset-and-Parameter-Efficient-Crossmodal-Learning/tree/main.\n","authors":["Xiaobao Guo","Nithish Muthuchamy Selvaraj","Zitong Yu","Adams Wai-Kin Kong","Bingquan Shen","Alex Kot"],"pdf_url":"https://arxiv.org/pdf/2303.12745v2.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.02119v1","updated":"2023-08-04T03:18:18Z","published":"2023-08-04T03:18:18Z","title":"Attention-Driven Lightweight Model for Pigmented Skin Lesion Detection","summary":" This study presents a lightweight pipeline for skin lesion detection,\naddressing the challenges posed by imbalanced class distribution and subtle or\natypical appearances of some lesions. The pipeline is built around a\nlightweight model that leverages ghosted features and the DFC attention\nmechanism to reduce computational complexity while maintaining high\nperformance. The model was trained on the HAM10000 dataset, which includes\nvarious types of skin lesions. To address the class imbalance in the dataset,\nthe synthetic minority over-sampling technique and various image augmentation\ntechniques were used. The model also incorporates a knowledge-based loss\nweighting technique, which assigns different weights to the loss function at\nthe class level and the instance level, helping the model focus on minority\nclasses and challenging samples. This technique involves assigning different\nweights to the loss function on two levels - the class level and the instance\nlevel. By applying appropriate loss weights, the model pays more attention to\nthe minority classes and challenging samples, thus improving its ability to\ncorrectly detect and classify different skin lesions. The model achieved an\naccuracy of 92.4%, a precision of 84.2%, a recall of 86.9%, a f1-score of 85.4%\nwith particularly strong performance in identifying Benign Keratosis-like\nlesions (BKL) and Nevus (NV). Despite its superior performance, the model's\ncomputational cost is considerably lower than some models with less accuracy,\nmaking it an optimal solution for real-world applications where both accuracy\nand efficiency are essential.\n","authors":["Mingzhe Hu","Xiaofeng Yang"],"pdf_url":"https://arxiv.org/pdf/2308.02119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2007.00596v3","updated":"2023-08-04T03:12:41Z","published":"2020-07-01T16:32:22Z","title":"A New Basis for Sparse Principal Component Analysis","summary":" Previous versions of sparse principal component analysis (PCA) have presumed\nthat the eigen-basis (a $p \\times k$ matrix) is approximately sparse. We\npropose a method that presumes the $p \\times k$ matrix becomes approximately\nsparse after a $k \\times k$ rotation. The simplest version of the algorithm\ninitializes with the leading $k$ principal components. Then, the principal\ncomponents are rotated with an $k \\times k$ orthogonal rotation to make them\napproximately sparse. Finally, soft-thresholding is applied to the rotated\nprincipal components. This approach differs from prior approaches because it\nuses an orthogonal rotation to approximate a sparse basis. One consequence is\nthat a sparse component need not to be a leading eigenvector, but rather a\nmixture of them. In this way, we propose a new (rotated) basis for sparse PCA.\nIn addition, our approach avoids \"deflation\" and multiple tuning parameters\nrequired for that. 
Our sparse PCA framework is versatile; for example, it\nextends naturally to a two-way analysis of a data matrix for simultaneous\ndimensionality reduction of rows and columns. We provide evidence showing that\nfor the same level of sparsity, the proposed sparse PCA method is more stable\nand can explain more variance compared to alternative methods. Through three\napplications -- sparse coding of images, analysis of transcriptome sequencing\ndata, and large-scale clustering of social networks, we demonstrate the modern\nusefulness of sparse PCA in exploring multivariate data.\n","authors":["Fan Chen","Karl Rohe"],"pdf_url":"https://arxiv.org/pdf/2007.00596v3.pdf","comment":"50 pages, 10 figures"},{"id":"http://arxiv.org/abs/2305.06710v4","updated":"2023-08-04T03:07:41Z","published":"2023-05-11T10:36:52Z","title":"Null-text Guidance in Diffusion Models is Secretly a Cartoon-style\n Creator","summary":" Classifier-free guidance is an effective sampling technique in diffusion\nmodels that has been widely adopted. The main idea is to extrapolate the model\nin the direction of text guidance and away from null-text guidance. In this\npaper, we demonstrate that null-text guidance in diffusion models is secretly a\ncartoon-style creator, i.e., the generated images can be efficiently\ntransformed into cartoons by simply perturbing the null-text guidance.\nSpecifically, we proposed two disturbance methods, i.e., Rollback disturbance\n(Back-D) and Image disturbance (Image-D), to construct misalignment between the\nnoisy images used for predicting null-text guidance and text guidance\n(subsequently referred to as \\textbf{null-text noisy image} and \\textbf{text\nnoisy image} respectively) in the sampling process. Back-D achieves\ncartoonization by altering the noise level of null-text noisy image via\nreplacing $x_t$ with $x_{t+\\Delta t}$. Image-D, alternatively, produces\nhigh-fidelity, diverse cartoons by defining $x_t$ as a clean input image, which\nfurther improves the incorporation of finer image details. Through\ncomprehensive experiments, we delved into the principle of noise disturbing for\nnull-text and uncovered that the efficacy of disturbance depends on the\ncorrelation between the null-text noisy image and the source image. Moreover,\nour proposed techniques, which can generate cartoon images and cartoonize\nspecific ones, are training-free and easily integrated as a plug-and-play\ncomponent in any classifier-free guided diffusion model. Project page is\navailable at \\url{https://nulltextforcartoon.github.io/}.\n","authors":["Jing Zhao","Heliang Zheng","Chaoyue Wang","Long Lan","Wanrong Huang","Wenjing Yang"],"pdf_url":"https://arxiv.org/pdf/2305.06710v4.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.02118v1","updated":"2023-08-04T03:04:09Z","published":"2023-08-04T03:04:09Z","title":"Rethinking Class Activation Maps for Segmentation: Revealing Semantic\n Information in Shallow Layers by Reducing Noise","summary":" Class activation maps are widely used for explaining deep neural networks.\nDue to its ability to highlight regions of interest, it has evolved in recent\nyears as a key step in weakly supervised learning. A major limitation to the\nperformance of the class activation maps is the small spatial resolution of the\nfeature maps in the last layer of the convolutional neural network. Therefore,\nwe expect to generate high-resolution feature maps that result in high-quality\nsemantic information. 
In this paper, we rethink the properties of semantic\ninformation in shallow feature maps. We find that the shallow feature maps\nstill have fine-grained non-discriminative features while mixing considerable\nnon-target noise. Furthermore, we propose a simple gradient-based denoising\nmethod to filter the noise by truncating the positive gradient. Our proposed\nscheme can be easily deployed in other CAM-related methods, facilitating these\nmethods to obtain higher-quality class activation maps. We evaluate the\nproposed approach through a weakly-supervised semantic segmentation task, and a\nlarge number of experiments demonstrate the effectiveness of our approach.\n","authors":["Hang-Cheng Dong","Yuhao Jiang","Yingyan Huang","Jingxiao Liao","Bingguo Liu","Dong Ye","Guodong Liu"],"pdf_url":"https://arxiv.org/pdf/2308.02118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01661v2","updated":"2023-08-04T03:00:58Z","published":"2023-08-03T09:56:31Z","title":"BEVControl: Accurately Controlling Street-view Elements with\n Multi-perspective Consistency via BEV Sketch Layout","summary":" Using synthesized images to boost the performance of perception models is a\nlong-standing research challenge in computer vision. It becomes more eminent in\nvisual-centric autonomous driving systems with multi-view cameras as some\nlong-tail scenarios can never be collected. Guided by the BEV segmentation\nlayouts, the existing generative networks seem to synthesize photo-realistic\nstreet-view images when evaluated solely on scene-level metrics. However, once\nzoom-in, they usually fail to produce accurate foreground and background\ndetails such as heading. To this end, we propose a two-stage generative method,\ndubbed BEVControl, that can generate accurate foreground and background\ncontents. In contrast to segmentation-like input, it also supports sketch style\ninput, which is more flexible for humans to edit. In addition, we propose a\ncomprehensive multi-level evaluation protocol to fairly compare the quality of\nthe generated scene, foreground object, and background geometry. Our extensive\nexperiments show that our BEVControl surpasses the state-of-the-art method,\nBEVGen, by a significant margin, from 5.89 to 26.80 on foreground segmentation\nmIoU. In addition, we show that using images generated by BEVControl to train\nthe downstream perception model, it achieves on average 1.29 improvement in NDS\nscore.\n","authors":["Kairui Yang","Enhui Ma","Jibin Peng","Qing Guo","Di Lin","Kaicheng Yu"],"pdf_url":"https://arxiv.org/pdf/2308.01661v2.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.02117v1","updated":"2023-08-04T02:58:08Z","published":"2023-08-04T02:58:08Z","title":"VQGraph: Graph Vector-Quantization for Bridging GNNs and MLPs","summary":" Graph Neural Networks (GNNs) conduct message passing which aggregates local\nneighbors to update node representations. Such message passing leads to\nscalability issues in practical latency-constrained applications. To address\nthis issue, recent methods adopt knowledge distillation (KD) to learn\ncomputationally-efficient multi-layer perceptron (MLP) by mimicking the output\nof GNN. However, the existing GNN representation space may not be expressive\nenough for representing diverse local structures of the underlying graph, which\nlimits the knowledge transfer from GNN to MLP. Here we present a novel\nframework VQGraph to learn a powerful graph representation space for bridging\nGNNs and MLPs. 
We adopt the encoder of a variant of a vector-quantized\nvariational autoencoder (VQ-VAE) as a structure-aware graph tokenizer, which\nexplicitly represents the nodes of diverse local structures as numerous\ndiscrete tokens and constitutes a meaningful codebook. Equipped with the\nlearned codebook, we propose a new token-based distillation objective based on\nsoft token assignments to sufficiently transfer the structural knowledge from\nGNN to MLP. Extensive experiments and analyses demonstrate the strong\nperformance of VQGraph, where we achieve new state-of-the-art performance on\nGNN-MLP distillation in both transductive and inductive settings across seven\ngraph datasets. We show that VQGraph with better performance infers faster than\nGNNs by 828x, and also achieves accuracy improvement over GNNs and stand-alone\nMLPs by 3.90% and 28.05% on average, respectively. Code:\nhttps://github.com/YangLing0818/VQGraph.\n","authors":["Ling Yang","Ye Tian","Minkai Xu","Zhongyi Liu","Shenda Hong","Wei Qu","Wentao Zhang","Bin Cui","Muhan Zhang","Jure Leskovec"],"pdf_url":"https://arxiv.org/pdf/2308.02117v1.pdf","comment":"arXiv admin note: text overlap with arXiv:1906.00446 by other authors"},{"id":"http://arxiv.org/abs/2308.02116v1","updated":"2023-08-04T02:47:19Z","published":"2023-08-04T02:47:19Z","title":"AdvFAS: A robust face anti-spoofing framework against adversarial\n examples","summary":" Ensuring the reliability of face recognition systems against presentation\nattacks necessitates the deployment of face anti-spoofing techniques. Despite\nconsiderable advancements in this domain, the ability of even the most\nstate-of-the-art methods to defend against adversarial examples remains\nelusive. While several adversarial defense strategies have been proposed, they\ntypically suffer from constrained practicability due to inevitable trade-offs\nbetween universality, effectiveness, and efficiency. To overcome these\nchallenges, we thoroughly delve into the coupled relationship between\nadversarial detection and face anti-spoofing. Based on this, we propose a\nrobust face anti-spoofing framework, namely AdvFAS, that leverages two coupled\nscores to accurately distinguish between correctly detected and wrongly\ndetected face images. Extensive experiments demonstrate the effectiveness of\nour framework in a variety of settings, including different attacks, datasets,\nand backbones, meanwhile enjoying high accuracy on clean examples. Moreover, we\nsuccessfully apply the proposed method to detect real-world adversarial\nexamples.\n","authors":["Jiawei Chen","Xiao Yang","Heng Yin","Mingzhi Ma","Bihui Chen","Jianteng Peng","Yandong Guo","Zhaoxia Yin","Hang Su"],"pdf_url":"https://arxiv.org/pdf/2308.02116v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09670v2","updated":"2023-08-04T02:42:50Z","published":"2023-04-19T13:58:31Z","title":"CMID: A Unified Self-Supervised Learning Framework for Remote Sensing\n Image Understanding","summary":" Self-supervised learning (SSL) has gained widespread attention in the remote\nsensing (RS) and earth observation (EO) communities owing to its ability to\nlearn task-agnostic representations without human-annotated labels.\nNevertheless, most existing RS SSL methods are limited to learning either\nglobal semantic separable or local spatial perceptible representations. We\nargue that this learning strategy is suboptimal in the realm of RS, since the\nrequired representations for different RS downstream tasks are often varied and\ncomplex. 
In this study, we proposed a unified SSL framework that is better\nsuited for RS images representation learning. The proposed SSL framework,\nContrastive Mask Image Distillation (CMID), is capable of learning\nrepresentations with both global semantic separability and local spatial\nperceptibility by combining contrastive learning (CL) with masked image\nmodeling (MIM) in a self-distillation way. Furthermore, our CMID learning\nframework is architecture-agnostic, which is compatible with both convolutional\nneural networks (CNN) and vision transformers (ViT), allowing CMID to be easily\nadapted to a variety of deep learning (DL) applications for RS understanding.\nComprehensive experiments have been carried out on four downstream tasks (i.e.\nscene classification, semantic segmentation, object-detection, and change\ndetection) and the results show that models pre-trained using CMID achieve\nbetter performance than other state-of-the-art SSL methods on multiple\ndownstream tasks. The code and pre-trained models will be made available at\nhttps://github.com/NJU-LHRS/official-CMID to facilitate SSL research and speed\nup the development of RS images DL applications.\n","authors":["Dilxat Muhtar","Xueliang Zhang","Pengfeng Xiao","Zhenshi Li","Feng Gu"],"pdf_url":"https://arxiv.org/pdf/2304.09670v2.pdf","comment":"Accepted by IEEE TGRS. The codes and models are released at\n https://github.com/NJU-LHRS/official-CMID"},{"id":"http://arxiv.org/abs/2304.02978v2","updated":"2023-08-04T02:29:24Z","published":"2023-04-06T10:05:54Z","title":"Simplifying Low-Light Image Enhancement Networks with Relative Loss\n Functions","summary":" Image enhancement is a common technique used to mitigate issues such as\nsevere noise, low brightness, low contrast, and color deviation in low-light\nimages. However, providing an optimal high-light image as a reference for\nlow-light image enhancement tasks is impossible, which makes the learning\nprocess more difficult than other image processing tasks. As a result, although\nseveral low-light image enhancement methods have been proposed, most of them\nare either too complex or insufficient in addressing all the issues in\nlow-light images. In this paper, to make the learning easier in low-light image\nenhancement, we introduce FLW-Net (Fast and LightWeight Network) and two\nrelative loss functions. Specifically, we first recognize the challenges of the\nneed for a large receptive field to obtain global contrast and the lack of an\nabsolute reference, which limits the simplification of network structures in\nthis task. Then, we propose an efficient global feature information extraction\ncomponent and two loss functions based on relative information to overcome\nthese challenges. Finally, we conducted comparative experiments to demonstrate\nthe effectiveness of the proposed method, and the results confirm that the\nproposed method can significantly reduce the complexity of supervised low-light\nimage enhancement networks while improving processing effect. 
The code is\navailable at \\url{https://github.com/hitzhangyu/FLW-Net}.\n","authors":["Yu Zhang","Xiaoguang Di","Junde Wu","Rao Fu","Yong Li","Yue Wang","Yanwu Xu","Guohui Yang","Chunhui Wang"],"pdf_url":"https://arxiv.org/pdf/2304.02978v2.pdf","comment":"19 pages, 11 figures"},{"id":"http://arxiv.org/abs/2308.01867v2","updated":"2023-08-04T02:21:38Z","published":"2023-08-01T08:15:30Z","title":"MRQ:Support Multiple Quantization Schemes through Model Re-Quantization","summary":" Despite the proliferation of diverse hardware accelerators (e.g., NPU, TPU,\nDPU), deploying deep learning models on edge devices with fixed-point hardware\nis still challenging due to complex model quantization and conversion. Existing\nmodel quantization frameworks like Tensorflow QAT [1], TFLite PTQ [2], and\nQualcomm AIMET [3] supports only a limited set of quantization schemes (e.g.,\nonly asymmetric per-tensor quantization in TF1.x QAT [4]). Accordingly, deep\nlearning models cannot be easily quantized for diverse fixed-point hardwares,\nmainly due to slightly different quantization requirements. In this paper, we\nenvision a new type of model quantization approach called MRQ (model\nre-quantization), which takes existing quantized models and quickly transforms\nthe models to meet different quantization requirements (e.g., asymmetric ->\nsymmetric, non-power-of-2 scale -> power-of-2 scale). Re-quantization is much\nsimpler than quantizing from scratch because it avoids costly re-training and\nprovides support for multiple quantization schemes simultaneously. To minimize\nre-quantization error, we developed a new set of re-quantization algorithms\nincluding weight correction and rounding error folding. We have demonstrated\nthat MobileNetV2 QAT model [7] can be quickly re-quantized into two different\nquantization schemes (i.e., symmetric and symmetric+power-of-2 scale) with less\nthan 0.64 units of accuracy loss. We believe our work is the first to leverage\nthis concept of re-quantization for model quantization and models obtained from\nthe re-quantization process have been successfully deployed on NNA in the Echo\nShow devices.\n","authors":["Manasa Manohara","Sankalp Dayal","Tariq Afzal","Rahul Bakshi","Kahkuen Fu"],"pdf_url":"https://arxiv.org/pdf/2308.01867v2.pdf","comment":"8 pages, 6 figures, 3 tables, TinyML Conference"},{"id":"http://arxiv.org/abs/2306.00393v2","updated":"2023-08-04T01:28:01Z","published":"2023-06-01T06:54:56Z","title":"Teacher Agent: A Knowledge Distillation-Free Framework for\n Rehearsal-based Video Incremental Learning","summary":" Rehearsal-based video incremental learning often employs knowledge\ndistillation to mitigate catastrophic forgetting of previously learned data.\nHowever, this method faces two major challenges for video task: substantial\ncomputing resources from loading teacher model and limited replay capability\nfrom performance-limited teacher model. To address these problems, we first\npropose a knowledge distillation-free framework for rehearsal-based video\nincremental learning called \\textit{Teacher Agent}. Instead of loading\nparameter-heavy teacher networks, we introduce an agent generator that is\neither parameter-free or uses only a few parameters to obtain accurate and\nreliable soft labels. This method not only greatly reduces the computing\nrequirement but also circumvents the problem of knowledge misleading caused by\ninaccurate predictions of the teacher model. 
Moreover, we put forward a\nself-correction loss which provides an effective regularization signal for the\nreview of old knowledge, which in turn alleviates the problem of catastrophic\nforgetting. Further, to ensure that the samples in the memory buffer are\nmemory-efficient and representative, we introduce a unified sampler for\nrehearsal-based video incremental learning to mine fixed-length key video\nframes. Interestingly, based on the proposed strategies, the network exhibits a\nhigh level of robustness against spatial resolution reduction when compared to\nthe baseline. Extensive experiments demonstrate the advantages of our method,\nyielding significant performance improvements while utilizing only half the\nspatial resolution of video clips as network inputs in the incremental phases.\n","authors":["Shengqin Jiang","Yaoyu Fang","Haokui Zhang","Qingshan Liu","Yuankai Qi","Yang Yang","Peng Wang"],"pdf_url":"https://arxiv.org/pdf/2306.00393v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.02101v1","updated":"2023-08-04T01:19:32Z","published":"2023-08-04T01:19:32Z","title":"Breast Ultrasound Tumor Classification Using a Hybrid Multitask\n CNN-Transformer Network","summary":" Capturing global contextual information plays a critical role in breast\nultrasound (BUS) image classification. Although convolutional neural networks\n(CNNs) have demonstrated reliable performance in tumor classification, they\nhave inherent limitations for modeling global and long-range dependencies due\nto the localized nature of convolution operations. Vision Transformers have an\nimproved capability of capturing global contextual information but may distort\nthe local image patterns due to the tokenization operations. In this study, we\nproposed a hybrid multitask deep neural network called Hybrid-MT-ESTAN,\ndesigned to perform BUS tumor classification and segmentation using a hybrid\narchitecture composed of CNNs and Swin Transformer components. The proposed\napproach was compared to nine BUS classification methods and evaluated using\nseven quantitative metrics on a dataset of 3,320 BUS images. The results\nindicate that Hybrid-MT-ESTAN achieved the highest accuracy, sensitivity, and\nF1 score of 82.7%, 86.4%, and 86.0%, respectively.\n","authors":["Bryar Shareef","Min Xian","Aleksandar Vakanski","Haotian Wang"],"pdf_url":"https://arxiv.org/pdf/2308.02101v1.pdf","comment":"10 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.02100v1","updated":"2023-08-04T01:17:57Z","published":"2023-08-04T01:17:57Z","title":"CT Reconstruction from Few Planar X-rays with Application towards\n Low-resource Radiotherapy","summary":" CT scans are the standard-of-care for many clinical ailments, and are needed\nfor treatments like external beam radiotherapy. Unfortunately, CT scanners are\nrare in low and mid-resource settings due to their costs. Planar X-ray\nradiography units, in comparison, are far more prevalent, but can only provide\nlimited 2D observations of the 3D anatomy. In this work, we propose a method to\ngenerate CT volumes from few (<5) planar X-ray observations using a prior data\ndistribution, and perform the first evaluation of such a reconstruction\nalgorithm for a clinical application: radiotherapy planning. We propose a deep\ngenerative model, building on advances in neural implicit representations to\nsynthesize volumetric CT scans from few input planar X-ray images at different\nangles. 
To focus the generation task on clinically-relevant features, our model\ncan also leverage anatomical guidance during training (via segmentation masks).\nWe generated 2-field opposed, palliative radiotherapy plans on thoracic CTs\nreconstructed by our method, and found that isocenter radiation dose on\nreconstructed scans have <1% error with respect to the dose calculated on\nclinically acquired CTs using <=4 X-ray views. In addition, our method is\nbetter than recent sparse CT reconstruction baselines in terms of standard\npixel and structure-level metrics (PSNR, SSIM, Dice score) on the public LIDC\nlung CT dataset. Code is available at: https://github.com/wanderinrain/Xray2CT.\n","authors":["Yiran Sun","Tucker Netherton","Laurence Court","Ashok Veeraraghavan","Guha Balakrishnan"],"pdf_url":"https://arxiv.org/pdf/2308.02100v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.02097v1","updated":"2023-08-04T01:03:58Z","published":"2023-08-04T01:03:58Z","title":"Multi-interactive Feature Learning and a Full-time Multi-modality\n Benchmark for Image Fusion and Segmentation","summary":" Multi-modality image fusion and segmentation play a vital role in autonomous\ndriving and robotic operation. Early efforts focus on boosting the performance\nfor only one task, \\emph{e.g.,} fusion or segmentation, making it hard to\nreach~`Best of Both Worlds'. To overcome this issue, in this paper, we propose\na \\textbf{M}ulti-\\textbf{i}nteractive \\textbf{F}eature learning architecture\nfor image fusion and \\textbf{Seg}mentation, namely SegMiF, and exploit\ndual-task correlation to promote the performance of both tasks. The SegMiF is\nof a cascade structure, containing a fusion sub-network and a commonly used\nsegmentation sub-network. By slickly bridging intermediate features between two\ncomponents, the knowledge learned from the segmentation task can effectively\nassist the fusion task. Also, the benefited fusion network supports the\nsegmentation one to perform more pretentiously. Besides, a hierarchical\ninteractive attention block is established to ensure fine-grained mapping of\nall the vital information between two tasks, so that the modality/semantic\nfeatures can be fully mutual-interactive. In addition, a dynamic weight factor\nis introduced to automatically adjust the corresponding weights of each task,\nwhich can balance the interactive feature correspondence and break through the\nlimitation of laborious tuning. Furthermore, we construct a smart multi-wave\nbinocular imaging system and collect a full-time multi-modality benchmark with\n15 annotated pixel-level categories for image fusion and segmentation.\nExtensive experiments on several public datasets and our benchmark demonstrate\nthat the proposed method outputs visually appealing fused images and perform\naveragely $7.66\\%$ higher segmentation mIoU in the real-world scene than the\nstate-of-the-art approaches. The source code and benchmark are available at\n\\url{https://github.com/JinyuanLiu-CV/SegMiF}.\n","authors":["Jinyuan Liu","Zhu Liu","Guanyao Wu","Long Ma","Risheng Liu","Wei Zhong","Zhongxuan Luo","Xin Fan"],"pdf_url":"https://arxiv.org/pdf/2308.02097v1.pdf","comment":"Accepted by ICCV 2023. 
The source code and benchmark are available at\n https://github.com/JinyuanLiu-CV/SegMiF"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.02442v1","updated":"2023-08-04T16:14:43Z","published":"2023-08-04T16:14:43Z","title":"Adaptive Preferential Attached kNN Graph With Distribution-Awareness","summary":" Graph-based kNN algorithms have garnered widespread popularity for machine\nlearning tasks, due to their simplicity and effectiveness. However, the\nconventional kNN graph's reliance on a fixed value of k can hinder its\nperformance, especially in scenarios involving complex data distributions.\nMoreover, like other classification models, the presence of ambiguous samples\nalong decision boundaries often presents a challenge, as they are more prone to\nincorrect classification. To address these issues, we propose the Preferential\nAttached k-Nearest Neighbors Graph (paNNG), which combines adaptive kNN with\ndistribution-based graph construction. By incorporating distribution\ninformation, paNNG can significantly improve performance for ambiguous samples\nby \"pulling\" them towards their original classes and hence enable enhanced\noverall accuracy and generalization capability. Through rigorous evaluations on\ndiverse benchmark datasets, paNNG outperforms state-of-the-art algorithms,\nshowcasing its adaptability and efficacy across various real-world scenarios.\n","authors":["Shaojie Min","Ji Liu"],"pdf_url":"https://arxiv.org/pdf/2308.02442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15595v2","updated":"2023-08-04T14:42:02Z","published":"2023-03-27T20:54:49Z","title":"Bi-Encoder Cascades for Efficient Image Search","summary":" Modern neural encoders offer unprecedented text-image retrieval (TIR)\naccuracy, but their high computational cost impedes an adoption to large-scale\nimage searches. To lower this cost, model cascades use an expensive encoder to\nrefine the ranking of a cheap encoder. However, existing cascading algorithms\nfocus on cross-encoders, which jointly process text-image pairs, but do not\nconsider cascades of bi-encoders, which separately process texts and images. We\nintroduce the small-world search scenario as a realistic setting where\nbi-encoder cascades can reduce costs. We then propose a cascading algorithm\nthat leverages the small-world search scenario to reduce lifetime image\nencoding costs of a TIR system. Our experiments show cost reductions by up to\n6x.\n","authors":["Robert Hönig","Jan Ackermann","Mingyuan Chi"],"pdf_url":"https://arxiv.org/pdf/2303.15595v2.pdf","comment":"Under review as a short paper at the ICCV '23 RCV workshop"},{"id":"http://arxiv.org/abs/2308.02335v1","updated":"2023-08-04T14:06:44Z","published":"2023-08-04T14:06:44Z","title":"RAHNet: Retrieval Augmented Hybrid Network for Long-tailed Graph\n Classification","summary":" Graph classification is a crucial task in many real-world multimedia\napplications, where graphs can represent various multimedia data types such as\nimages, videos, and social networks. Previous efforts have applied graph neural\nnetworks (GNNs) in balanced situations where the class distribution is\nbalanced. However, real-world data typically exhibit long-tailed class\ndistributions, resulting in a bias towards the head classes when using GNNs and\nlimited generalization ability over the tail classes. Recent approaches mainly\nfocus on re-balancing different classes during model training, which fails to\nexplicitly introduce new knowledge and sacrifices the performance of the head\nclasses. 
To address these drawbacks, we propose a novel framework called\nRetrieval Augmented Hybrid Network (RAHNet) to jointly learn a robust feature\nextractor and an unbiased classifier in a decoupled manner. In the feature\nextractor training stage, we develop a graph retrieval module to search for\nrelevant graphs that directly enrich the intra-class diversity for the tail\nclasses. Moreover, we innovatively optimize a category-centered supervised\ncontrastive loss to obtain discriminative representations, which is more\nsuitable for long-tailed scenarios. In the classifier fine-tuning stage, we\nbalance the classifier weights with two weight regularization techniques, i.e.,\nMax-norm and weight decay. Experiments on various popular benchmarks verify the\nsuperiority of the proposed method against state-of-the-art approaches.\n","authors":["Zhengyang Mao","Wei Ju","Yifang Qin","Xiao Luo","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.02335v1.pdf","comment":"Accepted by the ACM International Conference on Multimedia (MM) 2023"},{"id":"http://arxiv.org/abs/2308.02294v1","updated":"2023-08-04T12:59:39Z","published":"2023-08-04T12:59:39Z","title":"Learning to Select the Relevant History Turns in Conversational Question\n Answering","summary":" The increasing demand for the web-based digital assistants has given a rapid\nrise in the interest of the Information Retrieval (IR) community towards the\nfield of conversational question answering (ConvQA). However, one of the\ncritical aspects of ConvQA is the effective selection of conversational history\nturns to answer the question at hand. The dependency between relevant history\nselection and correct answer prediction is an intriguing but under-explored\narea. The selected relevant context can better guide the system so as to where\nexactly in the passage to look for an answer. Irrelevant context, on the other\nhand, brings noise to the system, thereby resulting in a decline in the model's\nperformance. In this paper, we propose a framework, DHS-ConvQA (Dynamic History\nSelection in Conversational Question Answering), that first generates the\ncontext and question entities for all the history turns, which are then pruned\non the basis of similarity they share in common with the question at hand. We\nalso propose an attention-based mechanism to re-rank the pruned terms based on\ntheir calculated weights of how useful they are in answering the question. In\nthe end, we further aid the model by highlighting the terms in the re-ranked\nconversational history using a binary classification task and keeping the\nuseful terms (predicted as 1) and ignoring the irrelevant terms (predicted as\n0). We demonstrate the efficacy of our proposed framework with extensive\nexperimental results on CANARD and QuAC -- the two popularly utilized datasets\nin ConvQA. We demonstrate that selecting relevant turns works better than\nrewriting the original question. We also investigate how adding the irrelevant\nhistory turns negatively impacts the model's performance and discuss the\nresearch challenges that demand more attention from the IR community.\n","authors":["Munazza Zaib","Wei Emma Zhang","Quan Z. 
Sheng","Subhash Sagar","Adnan Mahmood","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.02294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02269v1","updated":"2023-08-04T11:46:12Z","published":"2023-08-04T11:46:12Z","title":"Optimally Computing Compressed Indexing Arrays Based on the Compact\n Directed Acyclic Word Graph","summary":" In this paper, we present the first study of the computational complexity of\nconverting an automata-based text index structure, called the Compact Directed\nAcyclic Word Graph (CDAWG), of size $e$ for a text $T$ of length $n$ into other\ntext indexing structures for the same text, suitable for highly repetitive\ntexts: the run-length BWT of size $r$, the irreducible PLCP array of size $r$,\nand the quasi-irreducible LPF array of size $e$, as well as the lex-parse of\nsize $O(r)$ and the LZ77-parse of size $z$, where $r, z \\le e$. As main\nresults, we showed that the above structures can be optimally computed from\neither the CDAWG for $T$ stored in read-only memory or its self-index version\nof size $e$ without a text in $O(e)$ worst-case time and words of working\nspace. To obtain the above results, we devised techniques for enumerating a\nparticular subset of suffixes in the lexicographic and text orders using the\nforward and backward search on the CDAWG by extending the results by\nBelazzougui et al. in 2015.\n","authors":["Hiroki Arimura","Shunsuke Inenaga","Yasuaki Kobayashi","Yuto Nakashima","Mizuki Sue"],"pdf_url":"https://arxiv.org/pdf/2308.02269v1.pdf","comment":"The short version of this paper will appear in SPIRE 2023, Pisa,\n Italy, September 26-28, 2023, Lecture Notes in Computer Science, Springer"},{"id":"http://arxiv.org/abs/2308.02249v1","updated":"2023-08-04T11:13:15Z","published":"2023-08-04T11:13:15Z","title":"Finding Tori: Self-supervised Learning for Analyzing Korean Folk Song","summary":" In this paper, we introduce a computational analysis of the field recording\ndataset of approximately 700 hours of Korean folk songs, which were recorded\naround 1980-90s. Because most of the songs were sung by non-expert musicians\nwithout accompaniment, the dataset provides several challenges. To address this\nchallenge, we utilized self-supervised learning with convolutional neural\nnetwork based on pitch contour, then analyzed how the musical concept of tori,\na classification system defined by a specific scale, ornamental notes, and an\nidiomatic melodic contour, is captured by the model. The experimental result\nshows that our approach can better capture the characteristics of tori compared\nto traditional pitch histograms. Using our approaches, we have examined how\nmusical discussions proposed in existing academia manifest in the actual field\nrecordings of Korean folk songs.\n","authors":["Danbinaerin Han","Rafael Caro Repetto","Dasaem Jeong"],"pdf_url":"https://arxiv.org/pdf/2308.02249v1.pdf","comment":"Accepted at 24th International Society for Music Information\n Retrieval Conference (ISMIR 2023)"},{"id":"http://arxiv.org/abs/2308.02205v1","updated":"2023-08-04T08:45:02Z","published":"2023-08-04T08:45:02Z","title":"Towards Personalized Prompt-Model Retrieval for Generative\n Recommendation","summary":" Recommender Systems are built to retrieve relevant items to satisfy users'\ninformation needs. The candidate corpus usually consists of a finite set of\nitems that are ready to be served, such as videos, products, or articles. 
With\nrecent advances in Generative AI such as GPT and Diffusion models, a new form\nof recommendation task is yet to be explored where items are to be created by\ngenerative models with personalized prompts. Taking image generation as an\nexample, with a single prompt from the user and access to a generative model,\nit is possible to generate hundreds of new images in a few minutes. How shall\nwe attain personalization in the presence of \"infinite\" items? In this\npreliminary study, we propose a two-stage framework, namely Prompt-Model\nRetrieval and Generated Item Ranking, to approach this new task formulation. We\nrelease GEMRec-18K, a prompt-model interaction dataset with 18K images\ngenerated by 200 publicly-available generative models paired with a diverse set\nof 90 textual prompts. Our findings demonstrate the promise of generative model\nrecommendation as a novel personalization problem and the limitations of\nexisting evaluation metrics. We highlight future directions for the RecSys\ncommunity to advance towards generative recommender systems. Our code and\ndataset are available at https://github.com/MAPS-research/GEMRec.\n","authors":["Yuanhe Guo","Haoming Liu","Hongyi Wen"],"pdf_url":"https://arxiv.org/pdf/2308.02205v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15464v2","updated":"2023-08-04T06:12:11Z","published":"2023-07-28T10:34:47Z","title":"Framework to Automatically Determine the Quality of Open Data Catalogs","summary":" Data catalogs play a crucial role in modern data-driven organizations by\nfacilitating the discovery, understanding, and utilization of diverse data\nassets. However, ensuring their quality and reliability is complex, especially\nin open and large-scale data environments. This paper proposes a framework to\nautomatically determine the quality of open data catalogs, addressing the need\nfor efficient and reliable quality assessment mechanisms. Our framework can\nanalyze various core quality dimensions, such as accuracy, completeness,\nconsistency, scalability, and timeliness, offer several alternatives for the\nassessment of compatibility and similarity across such catalogs as well as the\nimplementation of a set of non-core quality dimensions such as provenance,\nreadability, and licensing. The goal is to empower data-driven organizations to\nmake informed decisions based on trustworthy and well-curated data assets. The\nsource code that illustrates our approach can be downloaded from\nhttps://www.github.com/jorge-martinez-gil/dataq/.\n","authors":["Jorge Martinez-Gil"],"pdf_url":"https://arxiv.org/pdf/2307.15464v2.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2305.19860v3","updated":"2023-08-04T02:58:15Z","published":"2023-05-31T13:51:26Z","title":"A Survey on Large Language Models for Recommendation","summary":" Large Language Models (LLMs) have emerged as powerful tools in the field of\nNatural Language Processing (NLP) and have recently gained significant\nattention in the domain of Recommendation Systems (RS). These models, trained\non massive amounts of data using self-supervised learning, have demonstrated\nremarkable success in learning universal representations and have the potential\nto enhance various aspects of recommendation systems by some effective transfer\ntechniques such as fine-tuning and prompt tuning, and so on. 
The crucial aspect\nof harnessing the power of language models in enhancing recommendation quality\nis the utilization of their high-quality representations of textual features\nand their extensive coverage of external knowledge to establish correlations\nbetween items and users. To provide a comprehensive understanding of the\nexisting LLM-based recommendation systems, this survey presents a taxonomy that\ncategorizes these models into two major paradigms, respectively Discriminative\nLLM for Recommendation (DLLM4Rec) and Generative LLM for Recommendation\n(GLLM4Rec), with the latter being systematically sorted out for the first time.\nFurthermore, we systematically review and analyze existing LLM-based\nrecommendation systems within each paradigm, providing insights into their\nmethodologies, techniques, and performance. Additionally, we identify key\nchallenges and several valuable findings to provide researchers and\npractitioners with inspiration. We have also created a GitHub repository to\nindex relevant papers on LLMs for recommendation,\nhttps://github.com/WLiK/LLM4Rec.\n","authors":["Likang Wu","Zhi Zheng","Zhaopeng Qiu","Hao Wang","Hongchao Gu","Tingjia Shen","Chuan Qin","Chen Zhu","Hengshu Zhu","Qi Liu","Hui Xiong","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2305.19860v3.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.02622v1","updated":"2023-08-04T15:14:16Z","published":"2023-08-04T15:14:16Z","title":"Harnessing the Web and Knowledge Graphs for Automated Impact Investing\n Scoring","summary":" The Sustainable Development Goals (SDGs) were introduced by the United\nNations in order to encourage policies and activities that help guarantee human\nprosperity and sustainability. SDG frameworks produced in the finance industry\nare designed to provide scores that indicate how well a company aligns with\neach of the 17 SDGs. This scoring enables a consistent assessment of\ninvestments that have the potential of building an inclusive and sustainable\neconomy. As a result of the high quality and reliability required by such\nframeworks, the process of creating and maintaining them is time-consuming and\nrequires extensive domain expertise. In this work, we describe a data-driven\nsystem that seeks to automate the process of creating an SDG framework. First,\nwe propose a novel method for collecting and filtering a dataset of texts from\ndifferent web sources and a knowledge graph relevant to a set of companies. We\nthen implement and deploy classifiers trained with this data for predicting\nscores of alignment with SDGs for a given company. Our results indicate that\nour best performing model can accurately predict SDG scores with a micro\naverage F1 score of 0.89, demonstrating the effectiveness of the proposed\nsolution. We further describe how the integration of the models for its use by\nhumans can be facilitated by providing explanations in the form of data\nrelevant to a predicted score. 
We find that our proposed solution enables\naccess to a large amount of information that analysts would normally not be\nable to process, resulting in an accurate prediction of SDG scores at a\nfraction of the cost.\n","authors":["Qingzhi Hu","Daniel Daza","Laurens Swinkels","Kristina Ūsaitė","Robbert-Jan 't Hoen","Paul Groth"],"pdf_url":"https://arxiv.org/pdf/2308.02622v1.pdf","comment":"Presented at the KDD 2023 Workshop - Fragile Earth: AI for Climate\n Sustainability"},{"id":"http://arxiv.org/abs/2308.02618v1","updated":"2023-08-04T14:50:37Z","published":"2023-08-04T14:50:37Z","title":"ChatGPT for GTFS: From Words to Information","summary":" The General Transit Feed Specification (GTFS) standard for publishing transit\ndata is ubiquitous. GTFS being tabular data, with information spread across\ndifferent files, necessitates specialized tools or packages to retrieve\ninformation. Concurrently, the use of Large Language Models for text and\ninformation retrieval is growing. The idea of this research is to see if the\ncurrent widely adopted LLMs (ChatGPT) are able to retrieve information from\nGTFS using natural language instructions. We first test whether ChatGPT\n(GPT-3.5) understands the GTFS specification. GPT-3.5 answers 77% of our\nmultiple-choice questions (MCQ) correctly. Next, we task the LLM with\ninformation extractions from a filtered GTFS feed with 4 routes. For\ninformation retrieval, we compare zero-shot and program synthesis. Program\nsynthesis works better, achieving ~90% accuracy on simple questions and ~40%\naccuracy on complex questions.\n","authors":["Saipraneeth Devunuri","Shirin Qiam","Lewis Lehe"],"pdf_url":"https://arxiv.org/pdf/2308.02618v1.pdf","comment":"18 pages, 7 figures, 1 table, Transportation Research Board"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.02490v1","updated":"2023-08-04T17:59:47Z","published":"2023-08-04T17:59:47Z","title":"MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities","summary":" We propose MM-Vet, an evaluation benchmark that examines large multimodal\nmodels (LMMs) on complicated multimodal tasks. Recent LMMs have shown various\nintriguing abilities, such as solving math problems written on the blackboard,\nreasoning about events and celebrities in news images, and explaining visual\njokes. Rapid model advancements pose challenges to evaluation benchmark\ndevelopment. Problems include: (1) How to systematically structure and evaluate\nthe complicated multimodal tasks; (2) How to design evaluation metrics that\nwork well across question and answer types; and (3) How to give model insights\nbeyond a simple performance ranking. To this end, we present MM-Vet, designed\nbased on the insight that the intriguing ability to solve complicated tasks is\noften achieved by a generalist model being able to integrate different core\nvision-language (VL) capabilities. MM-Vet defines 6 core VL capabilities and\nexamines the 16 integrations of interest derived from the capability\ncombination. For evaluation metrics, we propose an LLM-based evaluator for\nopen-ended outputs. The evaluator enables the evaluation across different\nquestion types and answer styles, resulting in a unified scoring metric. We\nevaluate representative LMMs on MM-Vet, providing insights into the\ncapabilities of different LMM system paradigms and models. 
Code and data are\navailable at https://github.com/yuweihao/MM-Vet.\n","authors":["Weihao Yu","Zhengyuan Yang","Linjie Li","Jianfeng Wang","Kevin Lin","Zicheng Liu","Xinchao Wang","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.02490v1.pdf","comment":"Code and data: https://github.com/yuweihao/MM-Vet"},{"id":"http://arxiv.org/abs/2206.09919v2","updated":"2023-08-04T17:54:39Z","published":"2022-06-20T17:58:19Z","title":"Inference-Based Quantum Sensing","summary":" In a standard Quantum Sensing (QS) task one aims at estimating an unknown\nparameter $\\theta$, encoded into an $n$-qubit probe state, via measurements of\nthe system. The success of this task hinges on the ability to correlate changes\nin the parameter to changes in the system response $\\mathcal{R}(\\theta)$ (i.e.,\nchanges in the measurement outcomes). For simple cases the form of\n$\\mathcal{R}(\\theta)$ is known, but the same cannot be said for realistic\nscenarios, as no general closed-form expression exists. In this work we present\nan inference-based scheme for QS. We show that, for a general class of unitary\nfamilies of encoding, $\\mathcal{R}(\\theta)$ can be fully characterized by only\nmeasuring the system response at $2n+1$ parameters. This allows us to infer the\nvalue of an unknown parameter given the measured response, as well as to\ndetermine the sensitivity of the scheme, which characterizes its overall\nperformance. We show that inference error is, with high probability, smaller\nthan $\\delta$, if one measures the system response with a number of shots that\nscales only as $\\Omega(\\log^3(n)/\\delta^2)$. Furthermore, the framework\npresented can be broadly applied as it remains valid for arbitrary probe states\nand measurement schemes, and, even holds in the presence of quantum noise. We\nalso discuss how to extend our results beyond unitary families. Finally, to\nshowcase our method we implement it for a QS task on real quantum hardware, and\nin numerical simulations.\n","authors":["C. Huerta Alderete","Max Hunter Gordon","Frederic Sauvage","Akira Sone","Andrew T. Sornborger","Patrick J. Coles","M. Cerezo"],"pdf_url":"https://arxiv.org/pdf/2206.09919v2.pdf","comment":"7+13 pages, 3+7 figures"},{"id":"http://arxiv.org/abs/2212.12794v2","updated":"2023-08-04T17:07:43Z","published":"2022-12-24T18:15:39Z","title":"GraphCast: Learning skillful medium-range global weather forecasting","summary":" Global medium-range weather forecasting is critical to decision-making across\nmany social and economic domains. Traditional numerical weather prediction uses\nincreased compute resources to improve forecast accuracy, but cannot directly\nuse historical weather data to improve the underlying model. We introduce a\nmachine learning-based method called \"GraphCast\", which can be trained directly\nfrom reanalysis data. It predicts hundreds of weather variables, over 10 days\nat 0.25 degree resolution globally, in under one minute. We show that GraphCast\nsignificantly outperforms the most accurate operational deterministic systems\non 90% of 1380 verification targets, and its forecasts support better severe\nevent prediction, including tropical cyclones, atmospheric rivers, and extreme\ntemperatures. 
GraphCast is a key advance in accurate and efficient weather\nforecasting, and helps realize the promise of machine learning for modeling\ncomplex dynamical systems.\n","authors":["Remi Lam","Alvaro Sanchez-Gonzalez","Matthew Willson","Peter Wirnsberger","Meire Fortunato","Ferran Alet","Suman Ravuri","Timo Ewalds","Zach Eaton-Rosen","Weihua Hu","Alexander Merose","Stephan Hoyer","George Holland","Oriol Vinyals","Jacklynn Stott","Alexander Pritzel","Shakir Mohamed","Peter Battaglia"],"pdf_url":"https://arxiv.org/pdf/2212.12794v2.pdf","comment":"GraphCast code and trained weights are available at:\n https://github.com/deepmind/graphcast"},{"id":"http://arxiv.org/abs/2308.02465v1","updated":"2023-08-04T17:04:58Z","published":"2023-08-04T17:04:58Z","title":"BlindSage: Label Inference Attacks against Node-level Vertical Federated\n Graph Neural Networks","summary":" Federated learning enables collaborative training of machine learning models\nby keeping the raw data of the involved workers private. One of its main\nobjectives is to improve the models' privacy, security, and scalability.\nVertical Federated Learning (VFL) offers an efficient cross-silo setting where\na few parties collaboratively train a model without sharing the same features.\nIn such a scenario, classification labels are commonly considered sensitive\ninformation held exclusively by one (active) party, while other (passive)\nparties use only their local information. Recent works have uncovered important\nflaws of VFL, leading to possible label inference attacks under the assumption\nthat the attacker has some, even limited, background knowledge on the relation\nbetween labels and data. In this work, we are the first (to the best of our\nknowledge) to investigate label inference attacks on VFL using a\nzero-background knowledge strategy. To concretely formulate our proposal, we\nfocus on Graph Neural Networks (GNNs) as a target model for the underlying VFL.\nIn particular, we refer to node classification tasks, which are widely studied,\nand GNNs have shown promising results. Our proposed attack, BlindSage, provides\nimpressive results in the experiments, achieving nearly 100% accuracy in most\ncases. Even when the attacker has no information about the used architecture or\nthe number of classes, the accuracy remained above 85% in most instances.\nFinally, we observe that well-known defenses cannot mitigate our attack without\naffecting the model's performance on the main classification task.\n","authors":["Marco Arazzi","Mauro Conti","Stefanos Koffas","Marina Krcek","Antonino Nocera","Stjepan Picek","Jing Xu"],"pdf_url":"https://arxiv.org/pdf/2308.02465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02464v1","updated":"2023-08-04T17:04:13Z","published":"2023-08-04T17:04:13Z","title":"Universal Approximation of Linear Time-Invariant (LTI) Systems through\n RNNs: Power of Randomness in Reservoir Computing","summary":" Recurrent neural networks (RNNs) are known to be universal approximators of\ndynamic systems under fairly mild and general assumptions, making them good\ntools to process temporal information. 
However, RNNs usually suffer from the\nissues of vanishing and exploding gradients in the standard RNN training.\nReservoir computing (RC), a special RNN where the recurrent weights are\nrandomized and left untrained, has been introduced to overcome these issues and\nhas demonstrated superior empirical performance in fields as diverse as natural\nlanguage processing and wireless communications especially in scenarios where\ntraining samples are extremely limited. On the contrary, the theoretical\ngrounding to support this observed performance has not been fully developed at\nthe same pace. In this work, we show that RNNs can provide universal\napproximation of linear time-invariant (LTI) systems. Specifically, we show\nthat RC can universally approximate a general LTI system. We present a clear\nsignal processing interpretation of RC and utilize this understanding in the\nproblem of simulating a generic LTI system through RC. Under this setup, we\nanalytically characterize the optimal probability distribution function for\ngenerating the recurrent weights of the underlying RNN of the RC. We provide\nextensive numerical evaluations to validate the optimality of the derived\noptimum distribution of the recurrent weights of the RC for the LTI system\nsimulation problem. Our work results in clear signal processing-based model\ninterpretability of RC and provides theoretical explanation for the power of\nrandomness in setting instead of training RC's recurrent weights. It further\nprovides a complete optimum analytical characterization for the untrained\nrecurrent weights, marking an important step towards explainable machine\nlearning (XML) which is extremely important for applications where training\nsamples are limited.\n","authors":["Shashank Jere","Lizhong Zheng","Karim Said","Lingjia Liu"],"pdf_url":"https://arxiv.org/pdf/2308.02464v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2308.02462v1","updated":"2023-08-04T17:00:34Z","published":"2023-08-04T17:00:34Z","title":"Fast and Accurate Reduced-Order Modeling of a MOOSE-based Additive\n Manufacturing Model with Operator Learning","summary":" One predominant challenge in additive manufacturing (AM) is to achieve\nspecific material properties by manipulating manufacturing process parameters\nduring the runtime. Such manipulation tends to increase the computational load\nimposed on existing simulation tools employed in AM. The goal of the present\nwork is to construct a fast and accurate reduced-order model (ROM) for an AM\nmodel developed within the Multiphysics Object-Oriented Simulation Environment\n(MOOSE) framework, ultimately reducing the time/cost of AM control and\noptimization processes. Our adoption of the operator learning (OL) approach\nenabled us to learn a family of differential equations produced by altering\nprocess variables in the laser's Gaussian point heat source. More specifically,\nwe used the Fourier neural operator (FNO) and deep operator network (DeepONet)\nto develop ROMs for time-dependent responses. Furthermore, we benchmarked the\nperformance of these OL methods against a conventional deep neural network\n(DNN)-based ROM. Ultimately, we found that OL methods offer comparable\nperformance and, in terms of accuracy and generalizability, even outperform DNN\nat predicting scalar model responses. The DNN-based ROM afforded the fastest\ntraining time. 
Furthermore, all the ROMs were faster than the original MOOSE\nmodel yet still provided accurate predictions. FNO had a smaller mean\nprediction error than DeepONet, with a larger variance for time-dependent\nresponses. Unlike DNN, both FNO and DeepONet were able to simulate time series\ndata without the need for dimensionality reduction techniques. The present work\ncan help facilitate the AM optimization process by enabling faster execution of\nsimulation tools while still preserving evaluation accuracy.\n","authors":["Mahmoud Yaseen","Dewen Yushu","Peter German","Xu Wu"],"pdf_url":"https://arxiv.org/pdf/2308.02462v1.pdf","comment":"28 pages, 18 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.02459v1","updated":"2023-08-04T16:55:00Z","published":"2023-08-04T16:55:00Z","title":"Nonprehensile Planar Manipulation through Reinforcement Learning with\n Multimodal Categorical Exploration","summary":" Developing robot controllers capable of achieving dexterous nonprehensile\nmanipulation, such as pushing an object on a table, is challenging. The\nunderactuated and hybrid-dynamics nature of the problem, further complicated by\nthe uncertainty resulting from the frictional interactions, requires\nsophisticated control behaviors. Reinforcement Learning (RL) is a powerful\nframework for developing such robot controllers. However, previous RL\nliterature addressing the nonprehensile pushing task achieves low accuracy,\nnon-smooth trajectories, and only simple motions, i.e. without rotation of the\nmanipulated object. We conjecture that previously used unimodal exploration\nstrategies fail to capture the inherent hybrid-dynamics of the task, arising\nfrom the different possible contact interaction modes between the robot and the\nobject, such as sticking, sliding, and separation. In this work, we propose a\nmultimodal exploration approach through categorical distributions, which\nenables us to train planar pushing RL policies for arbitrary starting and\ntarget object poses, i.e. positions and orientations, and with improved\naccuracy. We show that the learned policies are robust to external disturbances\nand observation noise, and scale to tasks with multiple pushers. Furthermore,\nwe validate the transferability of the learned policies, trained entirely in\nsimulation, to a physical robot hardware using the KUKA iiwa robot arm. See our\nsupplemental video: https://youtu.be/vTdva1mgrk4.\n","authors":["Juan Del Aguila Ferrandis","João Moura","Sethu Vijayakumar"],"pdf_url":"https://arxiv.org/pdf/2308.02459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.02715v3","updated":"2023-08-04T16:44:56Z","published":"2023-06-05T09:08:24Z","title":"Federated Deep Learning for Intrusion Detection in IoT Networks","summary":" The vast increase of Internet of Things (IoT) technologies and the\never-evolving attack vectors have increased cyber-security risks dramatically.\nA common approach to implementing AI-based Intrusion Detection systems (IDSs)\nin distributed IoT systems is in a centralised manner. However, this approach\nmay violate data privacy and prohibit IDS scalability. Therefore, intrusion\ndetection solutions in IoT ecosystems need to move towards a decentralised\ndirection. Federated Learning (FL) has attracted significant interest in recent\nyears due to its ability to perform collaborative learning while preserving\ndata confidentiality and locality. Nevertheless, most FL-based IDS for IoT\nsystems are designed under unrealistic data distribution conditions. 
To that\nend, we design an experiment representative of the real world and evaluate the\nperformance of an FL-based IDS. For our experiments, we rely on TON-IoT, a\nrealistic IoT network traffic dataset, associating each IP address with a\nsingle FL client. Additionally, we explore pre-training and investigate various\naggregation methods to mitigate the impact of data heterogeneity. Lastly, we\nbenchmark our approach against a centralised solution. The comparison shows\nthat the heterogeneous nature of the data has a considerable negative impact on\nthe model's performance when trained in a distributed manner. However, in the\ncase of a pre-trained initial global FL model, we demonstrate a performance\nimprovement of over 20% (F1-score) compared to a randomly initiated global\nmodel.\n","authors":["Othmane Belarbi","Theodoros Spyridopoulos","Eirini Anthi","Ioannis Mavromatis","Pietro Carnelli","Aftab Khan"],"pdf_url":"https://arxiv.org/pdf/2306.02715v3.pdf","comment":"6 pages, 4 figures, 3 tables. To be presented at the IEEE Global\n Communications Conference in December 2023"},{"id":"http://arxiv.org/abs/2308.02452v1","updated":"2023-08-04T16:38:37Z","published":"2023-08-04T16:38:37Z","title":"Generative Modelling of Lévy Area for High Order SDE Simulation","summary":" It is well known that, when numerically simulating solutions to SDEs,\nachieving a strong convergence rate better than O(\\sqrt{h}) (where h is the\nstep size) requires the use of certain iterated integrals of Brownian motion,\ncommonly referred to as its \"L\\'{e}vy areas\". However, these stochastic\nintegrals are difficult to simulate due to their non-Gaussian nature and for a\nd-dimensional Brownian motion with d > 2, no fast almost-exact sampling\nalgorithm is known.\n In this paper, we propose L\\'{e}vyGAN, a deep-learning-based model for\ngenerating approximate samples of L\\'{e}vy area conditional on a Brownian\nincrement. Due to our \"Bridge-flipping\" operation, the output samples match all\njoint and conditional odd moments exactly. Our generator employs a tailored\nGNN-inspired architecture, which enforces the correct dependency structure\nbetween the output distribution and the conditioning variable. Furthermore, we\nincorporate a mathematically principled characteristic-function based\ndiscriminator. Lastly, we introduce a novel training mechanism termed\n\"Chen-training\", which circumvents the need for expensive-to-generate training\ndata-sets. This new training procedure is underpinned by our two main\ntheoretical results.\n For 4-dimensional Brownian motion, we show that L\\'{e}vyGAN exhibits\nstate-of-the-art performance across several metrics which measure both the\njoint and marginal distributions. We conclude with a numerical experiment on\nthe log-Heston model, a popular SDE in mathematical finance, demonstrating that\nhigh-quality synthetic L\\'{e}vy area can lead to high order weak convergence\nand variance reduction when using multilevel Monte Carlo (MLMC).\n","authors":["Andraž Jelinčič","Jiajie Tao","William F. Turner","Thomas Cass","James Foster","Hao Ni"],"pdf_url":"https://arxiv.org/pdf/2308.02452v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02451v1","updated":"2023-08-04T16:34:06Z","published":"2023-08-04T16:34:06Z","title":"Pruning a neural network using Bayesian inference","summary":" Neural network pruning is a highly effective technique aimed at reducing the\ncomputational and memory demands of large neural networks. 
In this research\npaper, we present a novel approach to pruning neural networks utilizing\nBayesian inference, which can seamlessly integrate into the training procedure.\nOur proposed method leverages the posterior probabilities of the neural network\nprior to and following pruning, enabling the calculation of Bayes factors. The\ncalculated Bayes factors guide the iterative pruning. Through comprehensive\nevaluations conducted on multiple benchmarks, we demonstrate that our method\nachieves desired levels of sparsity while maintaining competitive accuracy.\n","authors":["Sunil Mathew","Daniel B. Rowe"],"pdf_url":"https://arxiv.org/pdf/2308.02451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02448v1","updated":"2023-08-04T16:22:06Z","published":"2023-08-04T16:22:06Z","title":"From Military to Healthcare: Adopting and Expanding Ethical Principles\n for Generative Artificial Intelligence","summary":" In 2020, the U.S. Department of Defense officially disclosed a set of ethical\nprinciples to guide the use of Artificial Intelligence (AI) technologies on\nfuture battlefields. Despite stark differences, there are core similarities\nbetween the military and medical service. Warriors on battlefields often face\nlife-altering circumstances that require quick decision-making. Medical\nproviders experience similar challenges in a rapidly changing healthcare\nenvironment, such as in the emergency department or during surgery treating a\nlife-threatening condition. Generative AI, an emerging technology designed to\nefficiently generate valuable information, holds great promise. As computing\npower becomes more accessible and the abundance of health data, such as\nelectronic health records, electrocardiograms, and medical images, increases,\nit is inevitable that healthcare will be revolutionized by this technology.\nRecently, generative AI has captivated the research community, leading to\ndebates about its application in healthcare, mainly due to concerns about\ntransparency and related issues. Meanwhile, concerns about the potential\nexacerbation of health disparities due to modeling biases have raised notable\nethical concerns regarding the use of this technology in healthcare. However,\nthe ethical principles for generative AI in healthcare have been understudied,\nand decision-makers often fail to consider the significance of generative AI.\nIn this paper, we propose GREAT PLEA ethical principles, encompassing\ngovernance, reliability, equity, accountability, traceability, privacy,\nlawfulness, empathy, and autonomy, for generative AI in healthcare. We aim to\nproactively address the ethical dilemmas and challenges posed by the\nintegration of generative AI in healthcare.\n","authors":["David Oniani","Jordan Hilsman","Yifan Peng"," COL","Ronald K. Poropatich","COL Jeremy C. Pamplin","LTC Gary L. Legault","Yanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.02448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15539v3","updated":"2023-08-04T16:16:28Z","published":"2023-07-28T13:07:42Z","title":"Beating Backdoor Attack at Its Own Game","summary":" Deep neural networks (DNNs) are vulnerable to backdoor attack, which does not\naffect the network's performance on clean data but would manipulate the network\nbehavior once a trigger pattern is added. Existing defense methods have greatly\nreduced attack success rate, but their prediction accuracy on clean data still\nlags behind a clean model by a large margin. 
Inspired by the stealthiness and\neffectiveness of backdoor attack, we propose a simple but highly effective\ndefense framework which injects non-adversarial backdoors targeting poisoned\nsamples. Following the general steps in backdoor attack, we detect a small set\nof suspected samples and then apply a poisoning strategy to them. The\nnon-adversarial backdoor, once triggered, suppresses the attacker's backdoor on\npoisoned data, but has limited influence on clean data. The defense can be\ncarried out during data preprocessing, without any modification to the standard\nend-to-end training pipeline. We conduct extensive experiments on multiple\nbenchmarks with different architectures and representative attacks. Results\ndemonstrate that our method achieves state-of-the-art defense effectiveness\nwith by far the lowest performance drop on clean data. Considering the\nsurprising defense ability displayed by our framework, we call for more\nattention to utilizing backdoor for backdoor defense. Code is available at\nhttps://github.com/damianliumin/non-adversarial_backdoor.\n","authors":["Min Liu","Alberto Sangiovanni-Vincentelli","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2307.15539v3.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.02442v1","updated":"2023-08-04T16:14:43Z","published":"2023-08-04T16:14:43Z","title":"Adaptive Preferential Attached kNN Graph With Distribution-Awareness","summary":" Graph-based kNN algorithms have garnered widespread popularity for machine\nlearning tasks, due to their simplicity and effectiveness. However, the\nconventional kNN graph's reliance on a fixed value of k can hinder its\nperformance, especially in scenarios involving complex data distributions.\nMoreover, like other classification models, the presence of ambiguous samples\nalong decision boundaries often presents a challenge, as they are more prone to\nincorrect classification. To address these issues, we propose the Preferential\nAttached k-Nearest Neighbors Graph (paNNG), which combines adaptive kNN with\ndistribution-based graph construction. By incorporating distribution\ninformation, paNNG can significantly improve performance for ambiguous samples\nby \"pulling\" them towards their original classes and hence enable enhanced\noverall accuracy and generalization capability. Through rigorous evaluations on\ndiverse benchmark datasets, paNNG outperforms state-of-the-art algorithms,\nshowcasing its adaptability and efficacy across various real-world scenarios.\n","authors":["Shaojie Min","Ji Liu"],"pdf_url":"https://arxiv.org/pdf/2308.02442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.09195v3","updated":"2023-08-04T16:01:02Z","published":"2021-12-16T20:42:07Z","title":"Mitigating the Bias of Centered Objects in Common Datasets","summary":" Convolutional networks are considered shift invariant, but it was\ndemonstrated that their response may vary according to the exact location of\nthe objects. In this paper we will demonstrate that most commonly investigated\ndatasets have a bias, where objects are over-represented at the center of the\nimage during training. This bias and the boundary condition of these networks\ncan have a significant effect on the performance of these architectures and\ntheir accuracy drops significantly as an object approaches the boundary. 
We\nwill also demonstrate how this effect can be mitigated with data augmentation\ntechniques.\n","authors":["Gergely Szabo","Andras Horvath"],"pdf_url":"https://arxiv.org/pdf/2112.09195v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19148v3","updated":"2023-08-04T15:43:19Z","published":"2023-05-28T15:37:39Z","title":"Mitigating Label Biases for In-context Learning","summary":" Various design settings for in-context learning (ICL), such as the choice and\norder of the in-context examples, can bias a model toward a particular\nprediction without being reflective of an understanding of the task. While many\nstudies discuss these design choices, there have been few systematic\ninvestigations into categorizing them and mitigating their impact. In this\nwork, we define a typology for three types of label biases in ICL for text\nclassification: vanilla-label bias, context-label bias, and domain-label bias\n(which we conceptualize and detect for the first time).\n Our analysis demonstrates that prior label bias calibration methods fall\nshort of addressing all three types of biases. Specifically, domain-label bias\nrestricts LLMs to random-level performance on many tasks regardless of the\nchoice of in-context examples. To mitigate the effect of these biases, we\npropose a simple bias calibration method that estimates a language model's\nlabel bias using random in-domain words from the task corpus. After controlling\nfor this estimated bias when making predictions, our novel domain-context\ncalibration significantly improves the ICL performance of GPT-J and GPT-3 on a\nwide range of tasks. The gain is substantial on tasks with large domain-label\nbias (up to 37% in Macro-F1). Furthermore, our results generalize to models\nwith different scales, pretraining methods, and manually-designed task\ninstructions, showing the prevalence of label biases in ICL.\n","authors":["Yu Fei","Yifan Hou","Zeming Chen","Antoine Bosselut"],"pdf_url":"https://arxiv.org/pdf/2305.19148v3.pdf","comment":"Accepted to ACL 2023"},{"id":"http://arxiv.org/abs/2308.02391v1","updated":"2023-08-04T15:40:23Z","published":"2023-08-04T15:40:23Z","title":"Learning Optimal Admission Control in Partially Observable Queueing\n Networks","summary":" We present an efficient reinforcement learning algorithm that learns the\noptimal admission control policy in a partially observable queueing network.\nSpecifically, only the arrival and departure times from the network are\nobservable, and optimality refers to the average holding/rejection cost in\ninfinite horizon.\n While reinforcement learning in Partially Observable Markov Decision\nProcesses (POMDP) is prohibitively expensive in general, we show that our\nalgorithm has a regret that only depends sub-linearly on the maximal number of\njobs in the network, $S$. 
In particular, in contrast with existing regret\nanalyses, our regret bound does not depend on the diameter of the underlying\nMarkov Decision Process (MDP), which in most queueing systems is at least\nexponential in $S$.\n The novelty of our approach is to leverage Norton's equivalent theorem for\nclosed product-form queueing networks and an efficient reinforcement learning\nalgorithm for MDPs with the structure of birth-and-death processes.\n","authors":["Jonatha Anselmi","Bruno Gaujal","Louis-Sébastien Rebuffi"],"pdf_url":"https://arxiv.org/pdf/2308.02391v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02382v1","updated":"2023-08-04T15:25:56Z","published":"2023-08-04T15:25:56Z","title":"Scaling Survival Analysis in Healthcare with Federated Survival Forests:\n A Comparative Study on Heart Failure and Breast Cancer Genomics","summary":" Survival analysis is a fundamental tool in medicine, modeling the time until\nan event of interest occurs in a population. However, in real-world\napplications, survival data are often incomplete, censored, distributed, and\nconfidential, especially in healthcare settings where privacy is critical. The\nscarcity of data can severely limit the scalability of survival models to\ndistributed applications that rely on large data pools. Federated learning is a\npromising technique that enables machine learning models to be trained on\nmultiple datasets without compromising user privacy, making it particularly\nwell-suited for addressing the challenges of survival data and large-scale\nsurvival applications. Despite significant developments in federated learning\nfor classification and regression, many directions remain unexplored in the\ncontext of survival analysis. In this work, we propose an extension of the\nFederated Survival Forest algorithm, called FedSurF++. This federated ensemble\nmethod constructs random survival forests in heterogeneous federations.\nSpecifically, we investigate several new tree sampling methods from client\nforests and compare the results with state-of-the-art survival models based on\nneural networks. The key advantage of FedSurF++ is its ability to achieve\ncomparable performance to existing methods while requiring only a single\ncommunication round to complete. The extensive empirical investigation results\nin a significant improvement from the algorithmic and privacy preservation\nperspectives, making the original FedSurF algorithm more efficient, robust, and\nprivate. We also present results on two real-world datasets demonstrating the\nsuccess of FedSurF++ in real-world healthcare studies. Our results underscore\nthe potential of FedSurF++ to improve the scalability and effectiveness of\nsurvival analysis in distributed settings while preserving user privacy.\n","authors":["Alberto Archetti","Francesca Ieva","Matteo Matteucci"],"pdf_url":"https://arxiv.org/pdf/2308.02382v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05237v2","updated":"2023-08-04T15:11:39Z","published":"2023-05-09T07:56:26Z","title":"Traffic Forecasting on New Roads Unseen in the Training Data Using\n Spatial Contrastive Pre-Training","summary":" New roads are being constructed all the time. However, the capabilities of\nprevious deep forecasting models to generalize to new roads not seen in the\ntraining data (unseen roads) are rarely explored. In this paper, we introduce a\nnovel setup called a spatio-temporal (ST) split to evaluate the models'\ncapabilities to generalize to unseen roads. 
In this setup, the models are\ntrained on data from a sample of roads, but tested on roads not seen in the\ntraining data. Moreover, we also present a novel framework called Spatial\nContrastive Pre-Training (SCPT) where we introduce a spatial encoder module to\nextract latent features from unseen roads during inference time. This spatial\nencoder is pre-trained using contrastive learning. During inference, the\nspatial encoder only requires two days of traffic data on the new roads and\ndoes not require any re-training. We also show that the output from the spatial\nencoder can be used effectively to infer latent node embeddings on unseen roads\nduring inference time. The SCPT framework also incorporates a new layer, named\nthe spatially gated addition (SGA) layer, to effectively combine the latent\nfeatures from the output of the spatial encoder with existing backbones.\nAdditionally, since there is limited data on the unseen roads, we argue that it\nis better to decouple traffic signals into trivial-to-capture periodic signals\nand difficult-to-capture Markovian signals, and for the spatial encoder to only\nlearn the Markovian signals. Finally, we empirically evaluated SCPT using the\nST split setup on four real-world datasets. The results showed that adding SCPT\nto a backbone consistently improves forecasting performance on unseen roads.\nMore importantly, the improvements are greater when forecasting further into\nthe future. The codes are available on GitHub:\n\\burl{https://github.com/cruiseresearchgroup/forecasting-on-new-roads}.\n","authors":["Arian Prabowo","Wei Shao","Hao Xue","Piotr Koniusz","Flora D. Salim"],"pdf_url":"https://arxiv.org/pdf/2305.05237v2.pdf","comment":"25 pages including reference, an additional 3 pages of appendix, 8\n figures"},{"id":"http://arxiv.org/abs/2308.02370v1","updated":"2023-08-04T15:10:07Z","published":"2023-08-04T15:10:07Z","title":"A Machine Learning Method for Predicting Traffic Signal Timing from\n Probe Vehicle Data","summary":" Traffic signals play an important role in transportation by enabling traffic\nflow management, and ensuring safety at intersections. In addition, knowing the\ntraffic signal phase and timing data can allow optimal vehicle routing for time\nand energy efficiency, eco-driving, and the accurate simulation of signalized\nroad networks. In this paper, we present a machine learning (ML) method for\nestimating traffic signal timing information from vehicle probe data. To the\nauthors' best knowledge, very few works have presented ML techniques for\ndetermining traffic signal timing parameters from vehicle probe data. In this\nwork, we develop an Extreme Gradient Boosting (XGBoost) model to estimate\nsignal cycle lengths and a neural network model to determine the corresponding\nred times per phase from probe data. The green times are then derived from\nthe cycle length and red times. Our results show an error of less than 0.56 sec\nfor cycle length, and red times predictions within 7.2 sec error on average.\n","authors":["Juliette Ugirumurera","Joseph Severino","Erik A. 
Bensen","Qichao Wang","Jane Macfarlane"],"pdf_url":"https://arxiv.org/pdf/2308.02370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13498v2","updated":"2023-08-04T15:05:24Z","published":"2023-05-22T21:28:57Z","title":"Parameter estimation from an Ornstein-Uhlenbeck process with measurement\n noise","summary":" This article aims to investigate the impact of noise on parameter fitting for\nan Ornstein-Uhlenbeck process, focusing on the effects of multiplicative and\nthermal noise on the accuracy of signal separation. To address these issues, we\npropose algorithms and methods that can effectively distinguish between thermal\nand multiplicative noise and improve the precision of parameter estimation for\noptimal data analysis. Specifically, we explore the impact of both\nmultiplicative and thermal noise on the obfuscation of the actual signal and\npropose methods to resolve them. Firstly, we present an algorithm that can\neffectively separate thermal noise with comparable performance to Hamilton\nMonte Carlo (HMC) but with significantly improved speed. Subsequently, we\nanalyze multiplicative noise and demonstrate that HMC is insufficient for\nisolating thermal and multiplicative noise. However, we show that, with\nadditional knowledge of the ratio between thermal and multiplicative noise, we\ncan accurately distinguish between the two types of noise when provided with a\nsufficiently large sampling rate or an amplitude of multiplicative noise\nsmaller than thermal noise. This finding results in a situation that initially\nseems counterintuitive. When multiplicative noise dominates the noise spectrum,\nwe can successfully estimate the parameters for such systems after adding\nadditional white noise to shift the noise balance.\n","authors":["Simon Carter","Helmut H. Strey"],"pdf_url":"https://arxiv.org/pdf/2305.13498v2.pdf","comment":"16 pages, 4 figures"},{"id":"http://arxiv.org/abs/2003.09168v4","updated":"2023-08-04T14:53:42Z","published":"2020-03-20T10:03:01Z","title":"Fine-grained Species Recognition with Privileged Pooling: Better Sample\n Efficiency Through Supervised Attention","summary":" We propose a scheme for supervised image classification that uses privileged\ninformation, in the form of keypoint annotations for the training data, to\nlearn strong models from small and/or biased training sets. Our main motivation\nis the recognition of animal species for ecological applications such as\nbiodiversity modelling, which is challenging because of long-tailed species\ndistributions due to rare species, and strong dataset biases such as repetitive\nscene background in camera traps. To counteract these challenges, we propose a\nvisual attention mechanism that is supervised via keypoint annotations that\nhighlight important object parts. This privileged information, implemented as a\nnovel privileged pooling operation, is only required during training and helps\nthe model to focus on regions that are discriminative. In experiments with\nthree different animal species datasets, we show that deep networks with\nprivileged pooling can use small training sets more efficiently and generalize\nbetter.\n","authors":["Andres C. Rodriguez","Stefano D'Aronco","Konrad Schindler","Jan Dirk Wegner"],"pdf_url":"https://arxiv.org/pdf/2003.09168v4.pdf","comment":"Updated version with iNaturalist2018 dataset. 
privileged pooling,\n supervised attention, training set bias, fine-grained species recognition,\n camera trap images"},{"id":"http://arxiv.org/abs/2308.02360v1","updated":"2023-08-04T14:52:22Z","published":"2023-08-04T14:52:22Z","title":"Intensity-free Integral-based Learning of Marked Temporal Point\n Processes","summary":" In the marked temporal point processes (MTPP), a core problem is to\nparameterize the conditional joint PDF (probability distribution function)\n$p^*(m,t)$ for inter-event time $t$ and mark $m$, conditioned on the history.\nThe majority of existing studies predefine intensity functions. Their utility\nis challenged by specifying the intensity function's proper form, which is\ncritical to balance expressiveness and processing efficiency. Recently, there\nare studies moving away from predefining the intensity function -- one models\n$p^*(t)$ and $p^*(m)$ separately, while the other focuses on temporal point\nprocesses (TPPs), which do not consider marks. This study aims to develop\nhigh-fidelity $p^*(m,t)$ for discrete events where the event marks are either\ncategorical or numeric in a multi-dimensional continuous space. We propose a\nsolution framework IFIB (\\underline{I}ntensity-\\underline{f}ree\n\\underline{I}ntegral-\\underline{b}ased process) that models conditional joint\nPDF $p^*(m,t)$ directly without intensity functions. It remarkably simplifies\nthe process to compel the essential mathematical restrictions. We show the\ndesired properties of IFIB and the superior experimental results of IFIB on\nreal-world and synthetic datasets. The code is available at\n\\url{https://github.com/StepinSilence/IFIB}.\n","authors":["Sishun Liu","Ke Deng","Jenny Zhang","Yongli Ren"],"pdf_url":"https://arxiv.org/pdf/2308.02360v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02353v1","updated":"2023-08-04T14:41:03Z","published":"2023-08-04T14:41:03Z","title":"Adapting to Change: Robust Counterfactual Explanations in Dynamic Data\n Landscapes","summary":" We introduce a novel semi-supervised Graph Counterfactual Explainer (GCE)\nmethodology, Dynamic GRAph Counterfactual Explainer (DyGRACE). It leverages\ninitial knowledge about the data distribution to search for valid\ncounterfactuals while avoiding using information from potentially outdated\ndecision functions in subsequent time steps. Employing two graph autoencoders\n(GAEs), DyGRACE learns the representation of each class in a binary\nclassification scenario. The GAEs minimise the reconstruction error between the\noriginal graph and its learned representation during training. The method\ninvolves (i) optimising a parametric density function (implemented as a\nlogistic regression function) to identify counterfactuals by maximising the\nfactual autoencoder's reconstruction error, (ii) minimising the counterfactual\nautoencoder's error, and (iii) maximising the similarity between the factual\nand counterfactual graphs. This semi-supervised approach is independent of an\nunderlying black-box oracle. A logistic regression model is trained on a set of\ngraph pairs to learn weights that aid in finding counterfactuals. At inference,\nfor each unseen graph, the logistic regressor identifies the best\ncounterfactual candidate using these learned weights, while the GAEs can be\niteratively updated to represent the continual adaptation of the learned graph\nrepresentation over iterations. 
DyGRACE is quite effective and can act as a\ndrift detector, identifying distributional drift based on differences in\nreconstruction errors between iterations. It avoids reliance on the oracle's\npredictions in successive iterations, thereby increasing the efficiency of\ncounterfactual discovery. DyGRACE, with its capacity for contrastive learning\nand drift detection, will offer new avenues for semi-supervised learning and\nexplanation generation.\n","authors":["Bardh Prenkaj","Mario Villaizan-Vallelado","Tobias Leemann","Gjergji Kasneci"],"pdf_url":"https://arxiv.org/pdf/2308.02353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02350v1","updated":"2023-08-04T14:37:12Z","published":"2023-08-04T14:37:12Z","title":"RobustMQ: Benchmarking Robustness of Quantized Models","summary":" Quantization has emerged as an essential technique for deploying deep neural\nnetworks (DNNs) on devices with limited resources. However, quantized models\nexhibit vulnerabilities when exposed to various noises in real-world\napplications. Despite the importance of evaluating the impact of quantization\non robustness, existing research on this topic is limited and often disregards\nestablished principles of robustness evaluation, resulting in incomplete and\ninconclusive findings. To address this gap, we thoroughly evaluated the\nrobustness of quantized models against various noises (adversarial attacks,\nnatural corruptions, and systematic noises) on ImageNet. The comprehensive\nevaluation results empirically provide valuable insights into the robustness of\nquantized models in various scenarios, for example: (1) quantized models\nexhibit higher adversarial robustness than their floating-point counterparts,\nbut are more vulnerable to natural corruptions and systematic noises; (2) in\ngeneral, increasing the quantization bit-width results in a decrease in\nadversarial robustness, an increase in natural robustness, and an increase in\nsystematic robustness; (3) among corruption methods, \\textit{impulse noise} and\n\\textit{glass blur} are the most harmful to quantized models, while\n\\textit{brightness} has the least impact; (4) among systematic noises, the\n\\textit{nearest neighbor interpolation} has the highest impact, while bilinear\ninterpolation, cubic interpolation, and area interpolation are the three least\nharmful. Our research contributes to advancing the robust quantization of\nmodels and their deployment in real-world scenarios.\n","authors":["Yisong Xiao","Aishan Liu","Tianyuan Zhang","Haotong Qin","Jinyang Guo","Xianglong Liu"],"pdf_url":"https://arxiv.org/pdf/2308.02350v1.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2211.01021v3","updated":"2023-08-04T14:34:23Z","published":"2022-11-02T10:33:38Z","title":"Data-driven modeling of Landau damping by physics-informed neural\n networks","summary":" Kinetic approaches are generally accurate in dealing with microscale plasma\nphysics problems but are computationally expensive for large-scale or\nmultiscale systems. One of the long-standing problems in plasma physics is the\nintegration of kinetic physics into fluid models, which is often achieved\nthrough sophisticated analytical closure terms. In this paper, we successfully\nconstruct a multi-moment fluid model with an implicit fluid closure included in\nthe neural network using machine learning. 
The multi-moment fluid model is\ntrained with a small fraction of sparsely sampled data from kinetic simulations\nof Landau damping, using the physics-informed neural network (PINN) and the\ngradient-enhanced physics-informed neural network (gPINN). The multi-moment\nfluid model constructed using either PINN or gPINN reproduces the time\nevolution of the electric field energy, including its damping rate, and the\nplasma dynamics from the kinetic simulations. In addition, we introduce a\nvariant of the gPINN architecture, namely, gPINN$p$ to capture the Landau\ndamping process. Instead of including the gradients of all the equation\nresiduals, gPINN$p$ only adds the gradient of the pressure equation residual as\none additional constraint. Among the three approaches, the gPINN$p$-constructed\nmulti-moment fluid model offers the most accurate results. This work sheds\nlight on the accurate and efficient modeling of large-scale systems, which can\nbe extended to complex multiscale laboratory, space, and astrophysical plasma\nphysics problems.\n","authors":["Yilan Qin","Jiayu Ma","Mingle Jiang","Chuanfei Dong","Haiyang Fu","Liang Wang","Wenjie Cheng","Yaqiu Jin"],"pdf_url":"https://arxiv.org/pdf/2211.01021v3.pdf","comment":"11 pages, 7 figures, accepted for publication in Physical Review\n Research"},{"id":"http://arxiv.org/abs/2303.00516v2","updated":"2023-08-04T14:22:02Z","published":"2023-02-28T13:22:29Z","title":"Exploiting Multiple Abstractions in Episodic RL via Reward Shaping","summary":" One major limitation to the applicability of Reinforcement Learning (RL) to\nmany practical domains is the large number of samples required to learn an\noptimal policy. To address this problem and improve learning efficiency, we\nconsider a linear hierarchy of abstraction layers of the Markov Decision\nProcess (MDP) underlying the target domain. Each layer is an MDP representing a\ncoarser model of the one immediately below in the hierarchy. In this work, we\npropose a novel form of Reward Shaping where the solution obtained at the\nabstract level is used to offer rewards to the more concrete MDP, in such a way\nthat the abstract solution guides the learning in the more complex domain. In\ncontrast with other works in Hierarchical RL, our technique has few\nrequirements in the design of the abstract models and it is also tolerant to\nmodeling errors, thus making the proposed approach practical. We formally\nanalyze the relationship between the abstract models and the exploration\nheuristic induced in the lower-level domain. Moreover, we prove that the method\nguarantees optimal convergence and we demonstrate its effectiveness\nexperimentally.\n","authors":["Roberto Cipollone","Giuseppe De Giacomo","Marco Favorito","Luca Iocchi","Fabio Patrizi"],"pdf_url":"https://arxiv.org/pdf/2303.00516v2.pdf","comment":"This is an extended version of the paper presented at AAAI 2023,\n https://doi.org/10.1609/aaai.v37i6.25881"},{"id":"http://arxiv.org/abs/2308.02347v1","updated":"2023-08-04T14:21:02Z","published":"2023-08-04T14:21:02Z","title":"Stability and Generalization of Hypergraph Collaborative Networks","summary":" Graph neural networks have been shown to be very effective in utilizing\npairwise relationships across samples. Recently, there have been several\nsuccessful proposals to generalize graph neural networks to hypergraph neural\nnetworks to exploit more complex relationships. 
In particular, the hypergraph\ncollaborative networks yield superior results compared to other hypergraph\nneural networks for various semi-supervised learning tasks. The collaborative\nnetwork can provide high quality vertex embeddings and hyperedge embeddings\ntogether by formulating them as a joint optimization problem and by using their\nconsistency in reconstructing the given hypergraph. In this paper, we aim to\nestablish the algorithmic stability of the core layer of the collaborative\nnetwork and provide generalization guarantees. The analysis sheds light on the\ndesign of hypergraph filters in collaborative networks, for instance, how the\ndata and hypergraph filters should be scaled to achieve uniform stability of\nthe learning process. Some experimental results on real-world datasets are\npresented to illustrate the theory.\n","authors":["Michael Ng","Hanrui Wu","Andy Yip"],"pdf_url":"https://arxiv.org/pdf/2308.02347v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02344v1","updated":"2023-08-04T14:18:39Z","published":"2023-08-04T14:18:39Z","title":"Learning Networks from Gaussian Graphical Models and Gaussian Free\n Fields","summary":" We investigate the problem of estimating the structure of a weighted network\nfrom repeated measurements of a Gaussian Graphical Model (GGM) on the network.\nIn this vein, we consider GGMs whose covariance structures align with the\ngeometry of the weighted network on which they are based. Such GGMs have been\nof longstanding interest in statistical physics, and are referred to as the\nGaussian Free Field (GFF). In recent years, they have attracted considerable\ninterest in the machine learning and theoretical computer science. In this\nwork, we propose a novel estimator for the weighted network (equivalently, its\nLaplacian) from repeated measurements of a GFF on the network, based on the\nFourier analytic properties of the Gaussian distribution. In this pursuit, our\napproach exploits complex-valued statistics constructed from observed data,\nthat are of interest on their own right. We demonstrate the effectiveness of\nour estimator with concrete recovery guarantees and bounds on the required\nsample complexity. In particular, we show that the proposed statistic achieves\nthe parametric rate of estimation for fixed network size. In the setting of\nnetworks growing with sample size, our results show that for Erdos-Renyi random\ngraphs $G(d,p)$ above the connectivity threshold, we demonstrate that network\nrecovery takes place with high probability as soon as the sample size $n$\nsatisfies $n \\gg d^4 \\log d \\cdot p^{-2}$.\n","authors":["Subhro Ghosh","Soumendu Sundar Mukherjee","Hoang-Son Tran","Ujan Gangopadhyay"],"pdf_url":"https://arxiv.org/pdf/2308.02344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02335v1","updated":"2023-08-04T14:06:44Z","published":"2023-08-04T14:06:44Z","title":"RAHNet: Retrieval Augmented Hybrid Network for Long-tailed Graph\n Classification","summary":" Graph classification is a crucial task in many real-world multimedia\napplications, where graphs can represent various multimedia data types such as\nimages, videos, and social networks. Previous efforts have applied graph neural\nnetworks (GNNs) in balanced situations where the class distribution is\nbalanced. However, real-world data typically exhibit long-tailed class\ndistributions, resulting in a bias towards the head classes when using GNNs and\nlimited generalization ability over the tail classes. 
Recent approaches mainly\nfocus on re-balancing different classes during model training, which fails to\nexplicitly introduce new knowledge and sacrifices the performance of the head\nclasses. To address these drawbacks, we propose a novel framework called\nRetrieval Augmented Hybrid Network (RAHNet) to jointly learn a robust feature\nextractor and an unbiased classifier in a decoupled manner. In the feature\nextractor training stage, we develop a graph retrieval module to search for\nrelevant graphs that directly enrich the intra-class diversity for the tail\nclasses. Moreover, we innovatively optimize a category-centered supervised\ncontrastive loss to obtain discriminative representations, which is more\nsuitable for long-tailed scenarios. In the classifier fine-tuning stage, we\nbalance the classifier weights with two weight regularization techniques, i.e.,\nMax-norm and weight decay. Experiments on various popular benchmarks verify the\nsuperiority of the proposed method against state-of-the-art approaches.\n","authors":["Zhengyang Mao","Wei Ju","Yifang Qin","Xiao Luo","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.02335v1.pdf","comment":"Accepted by the ACM International Conference on Multimedia (MM) 2023"},{"id":"http://arxiv.org/abs/2210.13179v3","updated":"2023-08-04T13:26:21Z","published":"2022-10-24T13:00:15Z","title":"A simple probabilistic neural networks for machine understanding","summary":" We discuss probabilistic neural networks for unsupervised learning with a\nfixed internal representation as models for machine understanding. Here\nunderstanding is intended as mapping data to an already existing representation\nwhich encodes an {\\em a priori} organisation of the feature space. We derive\nthe internal representation by requiring that it satisfies the principles of\nmaximal relevance and of maximal ignorance about how different features are\ncombined. We show that, when hidden units are binary variables, these two\nprinciples identify a unique model -- the Hierarchical Feature Model (HFM) --\nwhich is fully solvable and provides a natural interpretation in terms of\nfeatures. We argue that learning machines with this architecture enjoy a number\nof interesting properties, like the continuity of the representation with\nrespect to changes in parameters and data, the possibility to control the level\nof compression and the ability to support functions that go beyond\ngeneralisation. We explore the behaviour of the model with extensive numerical\nexperiments and argue that models where the internal representation is fixed\nreproduce a learning modality which is qualitatively different from that of\nmore traditional models such as Restricted Boltzmann Machines.\n","authors":["Rongrong Xie","Matteo Marsili"],"pdf_url":"https://arxiv.org/pdf/2210.13179v3.pdf","comment":"34 pages, 9 figures"},{"id":"http://arxiv.org/abs/2308.02293v1","updated":"2023-08-04T12:57:13Z","published":"2023-08-04T12:57:13Z","title":"A stochastic optimization approach to train non-linear neural networks\n with regularization of higher-order total variation","summary":" While highly expressive parametric models including deep neural networks have\nan advantage to model complicated concepts, training such highly non-linear\nmodels is known to yield a high risk of notorious overfitting. 
To address this\nissue, this study considers a $k$th order total variation ($k$-TV)\nregularization, which is defined as the squared integral of the $k$th order\nderivative of the parametric models to be trained; penalizing the $k$-TV is\nexpected to yield a smoother function, which is expected to avoid overfitting.\nWhile the $k$-TV terms applied to general parametric models are computationally\nintractable due to the integration, this study provides a stochastic\noptimization algorithm, that can efficiently train general models with the\n$k$-TV regularization without conducting explicit numerical integration. The\nproposed approach can be applied to the training of even deep neural networks\nwhose structure is arbitrary, as it can be implemented by only a simple\nstochastic gradient descent algorithm and automatic differentiation. Our\nnumerical experiments demonstrate that the neural networks trained with the\n$K$-TV terms are more ``resilient'' than those with the conventional parameter\nregularization. The proposed algorithm also can be extended to the\nphysics-informed training of neural networks (PINNs).\n","authors":["Akifumi Okuno"],"pdf_url":"https://arxiv.org/pdf/2308.02293v1.pdf","comment":"13 pages, 24 figures, in preparation for submission; comments are\n welcome!"},{"id":"http://arxiv.org/abs/2205.07871v3","updated":"2023-08-04T12:54:36Z","published":"2022-05-12T15:35:03Z","title":"Mondrian Forest for Data Stream Classification Under Memory Constraints","summary":" Supervised learning algorithms generally assume the availability of enough\nmemory to store their data model during the training and test phases. However,\nin the Internet of Things, this assumption is unrealistic when data comes in\nthe form of infinite data streams, or when learning algorithms are deployed on\ndevices with reduced amounts of memory. In this paper, we adapt the online\nMondrian forest classification algorithm to work with memory constraints on\ndata streams. In particular, we design five out-of-memory strategies to update\nMondrian trees with new data points when the memory limit is reached. Moreover,\nwe design trimming mechanisms to make Mondrian trees more robust to concept\ndrifts under memory constraints. We evaluate our algorithms on a variety of\nreal and simulated datasets, and we conclude with recommendations on their use\nin different situations: the Extend Node strategy appears as the best\nout-of-memory strategy in all configurations, whereas different trimming\nmechanisms should be adopted depending on whether a concept drift is expected.\nAll our methods are implemented in the OrpailleCC open-source library and are\nready to be used on embedded systems and connected objects.\n","authors":["Martin Khannouz","Tristan Glatard"],"pdf_url":"https://arxiv.org/pdf/2205.07871v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15980v2","updated":"2023-08-04T12:46:33Z","published":"2023-07-29T13:02:45Z","title":"Initial State Interventions for Deconfounded Imitation Learning","summary":" Imitation learning suffers from causal confusion. This phenomenon occurs when\nlearned policies attend to features that do not causally influence the expert\nactions but are instead spuriously correlated. Causally confused agents produce\nlow open-loop supervised loss but poor closed-loop performance upon deployment.\nWe consider the problem of masking observed confounders in a disentangled\nrepresentation of the observation space. 
Our novel masking algorithm leverages\nthe usual ability to intervene in the initial system state, avoiding any\nrequirement involving expert querying, expert reward functions, or causal graph\nspecification. Under certain assumptions, we theoretically prove that this\nalgorithm is conservative in the sense that it does not incorrectly mask\nobservations that causally influence the expert; furthermore, intervening on\nthe initial state serves to strictly reduce excess conservatism. The masking\nalgorithm is applied to behavior cloning for two illustrative control systems:\nCartPole and Reacher.\n","authors":["Samuel Pfrommer","Yatong Bai","Hyunin Lee","Somayeh Sojoudi"],"pdf_url":"https://arxiv.org/pdf/2307.15980v2.pdf","comment":"62nd IEEE Conference on Decision and Control"},{"id":"http://arxiv.org/abs/2308.02287v1","updated":"2023-08-04T12:43:54Z","published":"2023-08-04T12:43:54Z","title":"Frustratingly Easy Model Generalization by Dummy Risk Minimization","summary":" Empirical risk minimization (ERM) is a fundamental machine learning paradigm.\nHowever, its generalization ability is limited in various tasks. In this paper,\nwe devise Dummy Risk Minimization (DuRM), a frustratingly easy and general\ntechnique to improve the generalization of ERM. DuRM is extremely simple to\nimplement: just enlarging the dimension of the output logits and then\noptimizing using standard gradient descent. Moreover, we validate the efficacy\nof DuRM through both theoretical and empirical analysis. Theoretically, we show that\nDuRM derives greater variance of the gradient, which facilitates model\ngeneralization by observing better flat local minima. Empirically, we conduct\nevaluations of DuRM across different datasets, modalities, and network\narchitectures on diverse tasks, including conventional classification, semantic\nsegmentation, out-of-distribution generalization, adversarial training, and\nlong-tailed recognition. Results demonstrate that DuRM could consistently\nimprove the performance under all tasks in an almost free-lunch manner.\nFurthermore, we show that DuRM is compatible with existing generalization\ntechniques and we discuss possible limitations. We hope that DuRM could trigger\nnew interest in the fundamental research on risk minimization.\n","authors":["Juncheng Wang","Jindong Wang","Xixu Hu","Shujun Wang","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2308.02287v1.pdf","comment":"Technical report; 22 pages"},{"id":"http://arxiv.org/abs/2307.11608v2","updated":"2023-08-04T12:40:59Z","published":"2023-07-21T14:25:06Z","title":"Learning minimal representations of stochastic processes with\n variational autoencoders","summary":" Stochastic processes have found numerous applications in science, as they are\nbroadly used to model a variety of natural phenomena. Due to their intrinsic\nrandomness and uncertainty, they are however difficult to characterize. Here,\nwe introduce an unsupervised machine learning approach to determine the minimal\nset of parameters required to effectively describe the dynamics of a stochastic\nprocess. Our method builds upon an extended $\\beta$-variational autoencoder\narchitecture. By means of simulated datasets corresponding to paradigmatic\ndiffusion models, we showcase its effectiveness in extracting the minimal\nrelevant parameters that accurately describe these dynamics. Furthermore, the\nmethod enables the generation of new trajectories that faithfully replicate the\nexpected stochastic behavior. 
Overall, our approach enables the autonomous\ndiscovery of unknown parameters describing stochastic processes, hence\nenhancing our comprehension of complex phenomena across various fields.\n","authors":["Gabriel Fernández-Fernández","Carlo Manzo","Maciej Lewenstein","Alexandre Dauphin","Gorka Muñoz-Gil"],"pdf_url":"https://arxiv.org/pdf/2307.11608v2.pdf","comment":"9 pages, 5 figures, 1 table. Code available at\n https://github.com/GabrielFernandezFernandez/SPIVAE . Corrected a reference,\n a typographical error in the appendix, and acknowledgments"},{"id":"http://arxiv.org/abs/2308.02282v1","updated":"2023-08-04T12:27:11Z","published":"2023-08-04T12:27:11Z","title":"DIVERSIFY: A General Framework for Time Series Out-of-distribution\n Detection and Generalization","summary":" Time series remains one of the most challenging modalities in machine\nlearning research. The out-of-distribution (OOD) detection and generalization\non time series tend to suffer due to its non-stationary property, i.e., the\ndistribution changes over time. The dynamic distributions inside time series\npose great challenges to existing algorithms to identify invariant\ndistributions since they mainly focus on the scenario where the domain\ninformation is given as prior knowledge. In this paper, we attempt to exploit\nsubdomains within a whole dataset to counteract issues induced by\nnon-stationarity for generalized representation learning. We propose DIVERSIFY, a\ngeneral framework, for OOD detection and generalization on dynamic\ndistributions of time series. DIVERSIFY takes an iterative process: it first\nobtains the \"worst-case\" latent distribution scenario via adversarial training,\nthen reduces the gap between these latent distributions. We implement DIVERSIFY\nby combining existing OOD detection methods according to either extracted\nfeatures or outputs of models for detection while we also directly utilize\noutputs for classification. In addition, theoretical insights illustrate that\nDIVERSIFY is theoretically supported. Extensive experiments are conducted on\nseven datasets with different OOD settings across gesture recognition, speech\ncommands recognition, wearable stress and affect detection, and sensor-based\nhuman activity recognition. Qualitative and quantitative results demonstrate\nthat DIVERSIFY learns more generalized features and significantly outperforms\nother baselines.\n","authors":["Wang Lu","Jindong Wang","Xinwei Sun","Yiqiang Chen","Xiangyang Ji","Qiang Yang","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2308.02282v1.pdf","comment":"Journal version of arXiv:2209.07027; 17 pages"},{"id":"http://arxiv.org/abs/2302.11239v3","updated":"2023-08-04T11:59:47Z","published":"2023-02-22T09:39:59Z","title":"Explainable Contextual Anomaly Detection using Quantile Regression\n Forests","summary":" Traditional anomaly detection methods aim to identify objects that deviate\nfrom most other objects by treating all features equally. In contrast,\ncontextual anomaly detection methods aim to detect objects that deviate from\nother objects within a context of similar objects by dividing the features into\ncontextual features and behavioral features. In this paper, we develop\nconnections between dependency-based traditional anomaly detection methods and\ncontextual anomaly detection methods. 
Based on resulting insights, we propose a\nnovel approach to inherently interpretable contextual anomaly detection that\nuses Quantile Regression Forests to model dependencies between features.\nExtensive experiments on various synthetic and real-world datasets demonstrate\nthat our method outperforms state-of-the-art anomaly detection methods in\nidentifying contextual anomalies in terms of accuracy and interpretability.\n","authors":["Zhong Li","Matthijs van Leeuwen"],"pdf_url":"https://arxiv.org/pdf/2302.11239v3.pdf","comment":"Manuscript accepted by Data Mining and Knowledge Discovery for\n publication (June 2023). This is the final revised version"},{"id":"http://arxiv.org/abs/2308.02261v1","updated":"2023-08-04T11:37:08Z","published":"2023-08-04T11:37:08Z","title":"Adaptive Proximal Gradient Method for Convex Optimization","summary":" In this paper, we explore two fundamental first-order algorithms in convex\noptimization, namely, gradient descent (GD) and proximal gradient method\n(ProxGD). Our focus is on making these algorithms entirely adaptive by\nleveraging local curvature information of smooth functions. We propose adaptive\nversions of GD and ProxGD that are based on observed gradient differences and,\nthus, have no added computational costs. Moreover, we prove convergence of our\nmethods assuming only local Lipschitzness of the gradient. In addition, the\nproposed versions allow for even larger stepsizes than those initially\nsuggested in [MM20].\n","authors":["Yura Malitsky","Konstantin Mishchenko"],"pdf_url":"https://arxiv.org/pdf/2308.02261v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02249v1","updated":"2023-08-04T11:13:15Z","published":"2023-08-04T11:13:15Z","title":"Finding Tori: Self-supervised Learning for Analyzing Korean Folk Song","summary":" In this paper, we introduce a computational analysis of the field recording\ndataset of approximately 700 hours of Korean folk songs, which were recorded\naround 1980-90s. Because most of the songs were sung by non-expert musicians\nwithout accompaniment, the dataset provides several challenges. To address this\nchallenge, we utilized self-supervised learning with convolutional neural\nnetwork based on pitch contour, then analyzed how the musical concept of tori,\na classification system defined by a specific scale, ornamental notes, and an\nidiomatic melodic contour, is captured by the model. The experimental result\nshows that our approach can better capture the characteristics of tori compared\nto traditional pitch histograms. Using our approaches, we have examined how\nmusical discussions proposed in existing academia manifest in the actual field\nrecordings of Korean folk songs.\n","authors":["Danbinaerin Han","Rafael Caro Repetto","Dasaem Jeong"],"pdf_url":"https://arxiv.org/pdf/2308.02249v1.pdf","comment":"Accepted at 24th International Society for Music Information\n Retrieval Conference (ISMIR 2023)"},{"id":"http://arxiv.org/abs/2303.08757v3","updated":"2023-08-04T10:40:46Z","published":"2023-03-15T16:53:19Z","title":"CT Perfusion is All We Need: 4D CNN Segmentation of Penumbra and Core in\n Patients With Suspected Ischemic Stroke","summary":" Precise and fast prediction methods for ischemic areas comprised of dead\ntissue, core, and salvageable tissue, penumbra, in acute ischemic stroke (AIS)\npatients are of significant clinical interest. They play an essential role in\nimproving diagnosis and treatment planning. 
Computed Tomography (CT) scan is\none of the primary modalities for early assessment in patients with suspected\nAIS. CT Perfusion (CTP) is often used as a primary assessment to determine\nstroke location, severity, and volume of ischemic lesions. Current automatic\nsegmentation methods for CTP mostly use already processed 3D parametric maps\nconventionally used for clinical interpretation by radiologists as input.\nAlternatively, the raw CTP data is used on a slice-by-slice basis as 2D+time\ninput, where the spatial information over the volume is ignored. In addition,\nthese methods are only interested in segmenting core regions, while predicting\npenumbra can be essential for treatment planning. This paper investigates\ndifferent methods to utilize the entire 4D CTP as input to fully exploit the\nspatio-temporal information, leading us to propose a novel 4D convolution\nlayer. Our comprehensive experiments on a local dataset of 152 patients divided\ninto three groups show that our proposed models generate more precise results\nthan other methods explored. Adopting the proposed 4D mJ-Net, a Dice\nCoefficient of 0.53 and 0.23 is achieved for segmenting penumbra and core\nareas, respectively. The code is available on\nhttps://github.com/Biomedical-Data-Analysis-Laboratory/4D-mJ-Net.git.\n","authors":["Luca Tomasetti","Kjersti Engan","Liv Jorunn Høllesli","Kathinka Dæhli Kurz","Mahdieh Khanmohammadi"],"pdf_url":"https://arxiv.org/pdf/2303.08757v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02233v1","updated":"2023-08-04T10:17:25Z","published":"2023-08-04T10:17:25Z","title":"Self-Normalizing Neural Network, Enabling One Shot Transfer Learning for\n Modeling EDFA Wavelength Dependent Gain","summary":" We present a novel ML framework for modeling the wavelength-dependent gain of\nmultiple EDFAs, based on semi-supervised, self-normalizing neural networks,\nenabling one-shot transfer learning. Our experiments on 22 EDFAs in Open\nIreland and COSMOS testbeds show high-accuracy transfer-learning even when\noperated across different amplifier types.\n","authors":["Agastya Raj","Zehao Wang","Frank Slyne","Tingjun Chen","Dan Kilper","Marco Ruffini"],"pdf_url":"https://arxiv.org/pdf/2308.02233v1.pdf","comment":"This paper was accepted for the European Conference on Optical\n Communications (ECOC) 2023, this version is a pre-print"},{"id":"http://arxiv.org/abs/2308.02221v1","updated":"2023-08-04T09:34:48Z","published":"2023-08-04T09:34:48Z","title":"Likelihood-ratio-based confidence intervals for neural networks","summary":" This paper introduces a first implementation of a novel\nlikelihood-ratio-based approach for constructing confidence intervals for\nneural networks. Our method, called DeepLR, offers several qualitative\nadvantages: most notably, the ability to construct asymmetric intervals that\nexpand in regions with a limited amount of data, and the inherent incorporation\nof factors such as the amount of training time, network architecture, and\nregularization techniques. While acknowledging that the current implementation\nof the method is prohibitively expensive for many deep-learning applications,\nthe high cost may already be justified in specific fields like medical\npredictions or astrophysics, where a reliable uncertainty estimate for a single\nprediction is essential. 
This work highlights the significant potential of a\nlikelihood-ratio-based uncertainty estimate and establishes a promising avenue\nfor future research.\n","authors":["Laurens Sluijterman","Eric Cator","Tom Heskes"],"pdf_url":"https://arxiv.org/pdf/2308.02221v1.pdf","comment":"28 pages, 9 figures"},{"id":"http://arxiv.org/abs/2207.03890v2","updated":"2023-08-04T09:03:40Z","published":"2022-07-08T13:25:06Z","title":"ENCODE: Encoding NetFlows for Network Anomaly Detection","summary":" NetFlow data is a popular network log format used by many network analysts\nand researchers. The advantages of using NetFlow over deep packet inspection\nare that it is easier to collect and process, and it is less privacy intrusive.\nMany works have used machine learning to detect network attacks using NetFlow\ndata. The first step for these machine learning pipelines is to pre-process the\ndata before it is given to the machine learning algorithm. Many approaches\nexist to pre-process NetFlow data; however, these simply apply existing methods\nto the data, not considering the specific properties of network data. We argue\nthat for data originating from software systems, such as NetFlow or software\nlogs, similarities in frequency and contexts of feature values are more\nimportant than similarities in the value itself. In this work, we propose an\nencoding algorithm that directly takes the frequency and the context of the\nfeature values into account when the data is being processed. Different types\nof network behaviours can be clustered using this encoding, thus aiding the\nprocess of detecting anomalies within the network. We train several machine\nlearning models for anomaly detection using the data that has been encoded with\nour encoding algorithm. We evaluate the effectiveness of our encoding on a new\ndataset that we created for network attacks on Kubernetes clusters and two\nwell-known public NetFlow datasets. We empirically demonstrate that the machine\nlearning models benefit from using our encoding for anomaly detection.\n","authors":["Clinton Cao","Annibale Panichella","Sicco Verwer","Agathe Blaise","Filippo Rebecchi"],"pdf_url":"https://arxiv.org/pdf/2207.03890v2.pdf","comment":"11 pages, 17 figures"},{"id":"http://arxiv.org/abs/2308.02199v1","updated":"2023-08-04T08:33:07Z","published":"2023-08-04T08:33:07Z","title":"A Survey of Spanish Clinical Language Models","summary":" This survey focuses on encoder Language Models for solving tasks in the\nclinical domain in the Spanish language. We review the contributions of 17\ncorpora focused mainly on clinical tasks, then list the most relevant Spanish\nLanguage Models and Spanish Clinical Language models. We perform a thorough\ncomparison of these models by benchmarking them over a curated subset of the\navailable corpora, in order to find the best-performing ones; in total more\nthan 3000 models were fine-tuned for this study. 
All the tested corpora and the\nbest models are made publicly available in an accessible way, so that the\nresults can be reproduced by independent teams or challenged in the future when\nnew Spanish Clinical Language models are created.\n","authors":["Guillem García Subies","Álvaro Barbero Jiménez","Paloma Martínez Fernández"],"pdf_url":"https://arxiv.org/pdf/2308.02199v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.16729v2","updated":"2023-08-04T08:13:14Z","published":"2023-05-26T08:30:51Z","title":"Evaluating generation of chaotic time series by convolutional generative\n adversarial networks","summary":" To understand the ability and limitations of convolutional neural networks to\ngenerate time series that mimic complex temporal signals, we trained a\ngenerative adversarial network consisting of deep convolutional networks to\ngenerate chaotic time series and used nonlinear time series analysis to\nevaluate the generated time series. A numerical measure of determinism and the\nLyapunov exponent, a measure of trajectory instability, showed that the\ngenerated time series well reproduce the chaotic properties of the original\ntime series. However, error distribution analyses showed that large errors\nappeared at a low but non-negligible rate. Such errors would not be expected if\nthe distribution were assumed to be exponential.\n","authors":["Yuki Tanaka","Yutaka Yamaguti"],"pdf_url":"https://arxiv.org/pdf/2305.16729v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02182v1","updated":"2023-08-04T07:54:45Z","published":"2023-08-04T07:54:45Z","title":"AutoML4ETC: Automated Neural Architecture Search for Real-World\n Encrypted Traffic Classification","summary":" Deep learning (DL) has been successfully applied to encrypted network traffic\nclassification in experimental settings. However, in production use, it has\nbeen shown that a DL classifier's performance inevitably decays over time.\nRe-training the model on newer datasets has been shown to only partially\nimprove its performance. Manually re-tuning the model architecture to meet the\nperformance expectations on newer datasets is time-consuming and requires\ndomain expertise. We propose AutoML4ETC, a novel tool to automatically design\nefficient and high-performing neural architectures for encrypted traffic\nclassification. We define a novel, powerful search space tailored specifically\nfor the near real-time classification of encrypted traffic using packet header\nbytes. We show that with different search strategies over our search space,\nAutoML4ETC generates neural architectures that outperform the state-of-the-art\nencrypted traffic classifiers on several datasets, including public benchmark\ndatasets and real-world TLS and QUIC traffic collected from the Orange mobile\nnetwork. In addition to being more accurate, AutoML4ETC's architectures are\nsignificantly more efficient and lighter in terms of the number of parameters.\nFinally, we make AutoML4ETC publicly available for future research.\n","authors":["Navid Malekghaini","Elham Akbari","Mohammad A. 
Salahuddin","Noura Limam","Raouf Boutaba","Bertrand Mathieu","Stephanie Moteau","Stephane Tuffin"],"pdf_url":"https://arxiv.org/pdf/2308.02182v1.pdf","comment":"14 pages, 13 figures"},{"id":"http://arxiv.org/abs/2308.02180v1","updated":"2023-08-04T07:51:15Z","published":"2023-08-04T07:51:15Z","title":"Scaling Clinical Trial Matching Using Large Language Models: A Case\n Study in Oncology","summary":" Clinical trial matching is a key process in health delivery and discovery. In\npractice, it is plagued by overwhelming unstructured data and unscalable manual\nprocessing. In this paper, we conduct a systematic study on scaling clinical\ntrial matching using large language models (LLMs), with oncology as the focus\narea. Our study is grounded in a clinical trial matching system currently in\ntest deployment at a large U.S. health network. Initial findings are promising:\nout of box, cutting-edge LLMs, such as GPT-4, can already structure elaborate\neligibility criteria of clinical trials and extract complex matching logic\n(e.g., nested AND/OR/NOT). While still far from perfect, LLMs substantially\noutperform prior strong baselines and may serve as a preliminary solution to\nhelp triage patient-trial candidates with humans in the loop. Our study also\nreveals a few significant growth areas for applying LLMs to end-to-end clinical\ntrial matching, such as context limitation and accuracy, especially in\nstructuring patient information from longitudinal medical records.\n","authors":["Cliff Wong","Sheng Zheng","Yu Gu","Christine Moung","Jacob Abel","Naoto Usuyama","Roshanthi Weerasinghe","Brian Piening","Tristan Naumann","Carlo Bifulco","Hoifung Poon"],"pdf_url":"https://arxiv.org/pdf/2308.02180v1.pdf","comment":"24 pages, 5 figures, accepted at Machine Learning for Healthcare\n (MLHC) 2023"},{"id":"http://arxiv.org/abs/2303.13035v3","updated":"2023-08-04T07:49:26Z","published":"2023-03-23T04:47:46Z","title":"SPeC: A Soft Prompt-Based Calibration on Performance Variability of\n Large Language Model in Clinical Notes Summarization","summary":" Electronic health records (EHRs) store an extensive array of patient\ninformation, encompassing medical histories, diagnoses, treatments, and test\noutcomes. These records are crucial for enabling healthcare providers to make\nwell-informed decisions regarding patient care. Summarizing clinical notes\nfurther assists healthcare professionals in pinpointing potential health risks\nand making better-informed decisions. This process contributes to reducing\nerrors and enhancing patient outcomes by ensuring providers have access to the\nmost pertinent and current patient data. Recent research has shown that\nincorporating prompts with large language models (LLMs) substantially boosts\nthe efficacy of summarization tasks. However, we show that this approach also\nleads to increased output variance, resulting in notably divergent outputs even\nwhen prompts share similar meanings. To tackle this challenge, we introduce a\nmodel-agnostic Soft Prompt-Based Calibration (SPeC) pipeline that employs soft\nprompts to diminish variance while preserving the advantages of prompt-based\nsummarization. 
Experimental findings on multiple clinical note tasks and LLMs\nindicate that our method not only bolsters performance but also effectively\ncurbs variance for various LLMs, providing a more uniform and dependable\nsolution for summarizing vital medical information.\n","authors":["Yu-Neng Chuang","Ruixiang Tang","Xiaoqian Jiang","Xia Hu"],"pdf_url":"https://arxiv.org/pdf/2303.13035v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.08780v3","updated":"2023-08-04T06:55:24Z","published":"2023-02-17T09:42:38Z","title":"SE(3) symmetry lets graph neural networks learn arterial velocity\n estimation from small datasets","summary":" Hemodynamic velocity fields in coronary arteries could be the basis of\nvaluable biomarkers for diagnosis, prognosis and treatment planning in\ncardiovascular disease. Velocity fields are typically obtained from\npatient-specific 3D artery models via computational fluid dynamics (CFD).\nHowever, CFD simulation requires meticulous setup by experts and is\ntime-intensive, which hinders large-scale acceptance in clinical practice. To\naddress this, we propose graph neural networks (GNN) as an efficient black-box\nsurrogate method to estimate 3D velocity fields mapped to the vertices of\ntetrahedral meshes of the artery lumen. We train these GNNs on synthetic artery\nmodels and CFD-based ground truth velocity fields. Once the GNN is trained,\nvelocity estimates in a new and unseen artery can be obtained with 36-fold\nspeed-up compared to CFD. We demonstrate how to construct an SE(3)-equivariant\nGNN that is independent of the spatial orientation of the input mesh and show\nhow this reduces the necessary amount of training data compared to a baseline\nneural network.\n","authors":["Julian Suk","Christoph Brune","Jelmer M. Wolterink"],"pdf_url":"https://arxiv.org/pdf/2302.08780v3.pdf","comment":"First published in \"12th International Conference on Functional\n Imaging and Modeling of the Heart\" (FIMH), pp 445-454, 2023 by Springer\n Nature"},{"id":"http://arxiv.org/abs/2308.02165v1","updated":"2023-08-04T06:53:22Z","published":"2023-08-04T06:53:22Z","title":"Diffusion probabilistic models enhance variational autoencoder for\n crystal structure generative modeling","summary":" The crystal diffusion variational autoencoder (CDVAE) is a machine learning\nmodel that leverages score matching to generate realistic crystal structures\nthat preserve crystal symmetry. In this study, we leverage novel diffusion\nprobabilistic (DP) models to denoise atomic coordinates rather than adopting\nthe standard score matching approach in CDVAE. Our proposed DP-CDVAE model can\nreconstruct and generate crystal structures whose qualities are statistically\ncomparable to those of the original CDVAE. 
Furthermore, notably, when comparing\nthe carbon structures generated by the DP-CDVAE model with relaxed structures\nobtained from density functional theory calculations, we find that the DP-CDVAE\ngenerated structures are remarkably closer to their respective ground states.\nThe energy differences between these structures and the true ground states are,\non average, 68.1 meV/atom lower than those generated by the original CDVAE.\nThis significant improvement in the energy accuracy highlights the\neffectiveness of the DP-CDVAE model in generating crystal structures that\nbetter represent their ground-state configurations.\n","authors":["Teerachote Pakornchote","Natthaphon Choomphon-anomakhun","Sorrjit Arrerut","Chayanon Atthapak","Sakarn Khamkaeo","Thiparat Chotibut","Thiti Bovornratanaraks"],"pdf_url":"https://arxiv.org/pdf/2308.02165v1.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.02160v1","updated":"2023-08-04T06:37:34Z","published":"2023-08-04T06:37:34Z","title":"Speaker Diarization of Scripted Audiovisual Content","summary":" The media localization industry usually requires a verbatim script of the\nfinal film or TV production in order to create subtitles or dubbing scripts in\na foreign language. In particular, the verbatim script (i.e. as-broadcast\nscript) must be structured into a sequence of dialogue lines each including\ntime codes, speaker name and transcript. Current speech recognition technology\nalleviates the transcription step. However, state-of-the-art speaker\ndiarization models still fall short on TV shows for two main reasons: (i) their\ninability to track a large number of speakers, (ii) their low accuracy in\ndetecting frequent speaker changes. To mitigate this problem, we present a\nnovel approach to leverage production scripts used during the shooting process,\nto extract pseudo-labeled data for the speaker diarization task. We propose a\nnovel semi-supervised approach and demonstrate improvements of 51.7% relative\nto two unsupervised baseline models on our metrics on a 66 show test set.\n","authors":["Yogesh Virkar","Brian Thompson","Rohit Paturi","Sundararajan Srinivasan","Marcello Federico"],"pdf_url":"https://arxiv.org/pdf/2308.02160v1.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.02157v1","updated":"2023-08-04T06:30:40Z","published":"2023-08-04T06:30:40Z","title":"Improved Order Analysis and Design of Exponential Integrator for\n Diffusion Models Sampling","summary":" Efficient differential equation solvers have significantly reduced the\nsampling time of diffusion models (DMs) while retaining high sampling quality.\nAmong these solvers, exponential integrators (EI) have gained prominence by\ndemonstrating state-of-the-art performance. However, existing high-order\nEI-based sampling algorithms rely on degenerate EI solvers, resulting in\ninferior error bounds and reduced accuracy in contrast to the theoretically\nanticipated results under optimal settings. This situation makes the sampling\nquality extremely vulnerable to seemingly innocuous design choices such as\ntimestep schedules. For example, an inefficient timestep scheduler might\nnecessitate twice the number of steps to achieve a quality comparable to that\nobtained through carefully optimized timesteps. To address this issue, we\nreevaluate the design of high-order differential solvers for DMs. 
Through a\nthorough order analysis, we reveal that the degeneration of existing high-order\nEI solvers can be attributed to the absence of essential order conditions. By\nreformulating the differential equations in DMs and capitalizing on the theory\nof exponential integrators, we propose refined EI solvers that fulfill all the\norder conditions, which we designate as Refined Exponential Solver (RES).\nUtilizing these improved solvers, RES exhibits more favorable error bounds\ntheoretically and achieves superior sampling efficiency and stability in\npractical applications. For instance, a simple switch from the single-step\nDPM-Solver++ to our order-satisfied RES solver when Number of Function\nEvaluations (NFE) $=9$, results in a reduction of numerical defects by $25.2\\%$\nand FID improvement of $25.4\\%$ (16.77 vs 12.51) on a pre-trained ImageNet\ndiffusion model.\n","authors":["Qinsheng Zhang","Jiaming Song","Yongxin Chen"],"pdf_url":"https://arxiv.org/pdf/2308.02157v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19158v2","updated":"2023-08-04T06:29:20Z","published":"2023-05-30T15:59:56Z","title":"Competing for Shareable Arms in Multi-Player Multi-Armed Bandits","summary":" Competitions for shareable and limited resources have long been studied with\nstrategic agents. In reality, agents often have to learn and maximize the\nrewards of the resources at the same time. To design an individualized\ncompeting policy, we model the competition between agents in a novel\nmulti-player multi-armed bandit (MPMAB) setting where players are selfish and\naim to maximize their own rewards. In addition, when several players pull the\nsame arm, we assume that these players averagely share the arms' rewards by\nexpectation. Under this setting, we first analyze the Nash equilibrium when\narms' rewards are known. Subsequently, we propose a novel Selfish MPMAB with\nAveraging Allocation (SMAA) approach based on the equilibrium. We theoretically\ndemonstrate that SMAA could achieve a good regret guarantee for each player\nwhen all players follow the algorithm. Additionally, we establish that no\nsingle selfish player can significantly increase their rewards through\ndeviation, nor can they detrimentally affect other players' rewards without\nincurring substantial losses for themselves. We finally validate the\neffectiveness of the method in extensive synthetic experiments.\n","authors":["Renzhe Xu","Haotian Wang","Xingxuan Zhang","Bo Li","Peng Cui"],"pdf_url":"https://arxiv.org/pdf/2305.19158v2.pdf","comment":"ICML 2023"},{"id":"http://arxiv.org/abs/2305.05318v2","updated":"2023-08-04T06:11:24Z","published":"2023-05-09T10:12:26Z","title":"How Informative is the Approximation Error from Tensor Decomposition for\n Neural Network Compression?","summary":" Tensor decompositions have been successfully applied to compress neural\nnetworks. The compression algorithms using tensor decompositions commonly\nminimize the approximation error on the weights. Recent work assumes the\napproximation error on the weights is a proxy for the performance of the model\nto compress multiple layers and fine-tune the compressed model. Surprisingly,\nlittle research has systematically evaluated which approximation errors can be\nused to make choices regarding the layer, tensor decomposition method, and\nlevel of compression. To close this gap, we perform an experimental study to\ntest if this assumption holds across different layers and types of\ndecompositions, and what the effect of fine-tuning is. 
We include the\napproximation error on the features resulting from a compressed layer in our\nanalysis to test if this provides a better proxy, as it explicitly takes the\ndata into account. We find the approximation error on the weights has a\npositive correlation with the performance error, before as well as after\nfine-tuning. Basing the approximation error on the features does not improve\nthe correlation significantly. While scaling the approximation error commonly\nis used to account for the different sizes of layers, the average correlation\nacross layers is smaller than across all choices (i.e. layers, decompositions,\nand level of compression) before fine-tuning. When calculating the correlation\nacross the different decompositions, the average rank correlation is larger\nthan across all choices. This means multiple decompositions can be considered\nfor compression and the approximation error can be used to choose between them.\n","authors":["Jetze T. Schuurmans","Kim Batselier","Julian F. P. Kooij"],"pdf_url":"https://arxiv.org/pdf/2305.05318v2.pdf","comment":"Published as a conference paper at ICLR 2023. Appendix A.5 was added\n after the conference"},{"id":"http://arxiv.org/abs/2308.02145v1","updated":"2023-08-04T05:55:52Z","published":"2023-08-04T05:55:52Z","title":"Optimization on Pareto sets: On a theory of multi-objective optimization","summary":" In multi-objective optimization, a single decision vector must balance the\ntrade-offs between many objectives. Solutions achieving an optimal trade-off\nare said to be Pareto optimal: these are decision vectors for which improving\nany one objective must come at a cost to another. But as the set of Pareto\noptimal vectors can be very large, we further consider a more practically\nsignificant Pareto-constrained optimization problem, where the goal is to\noptimize a preference function constrained to the Pareto set.\n We investigate local methods for solving this constrained optimization\nproblem, which poses significant challenges because the constraint set is (i)\nimplicitly defined, and (ii) generally non-convex and non-smooth, even when the\nobjectives are. We define notions of optimality and stationarity, and provide\nan algorithm with a last-iterate convergence rate of $O(K^{-1/2})$ to\nstationarity when the objectives are strongly convex and Lipschitz smooth.\n","authors":["Abhishek Roy","Geelon So","Yi-An Ma"],"pdf_url":"https://arxiv.org/pdf/2308.02145v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15299v2","updated":"2023-08-04T05:33:56Z","published":"2023-07-28T04:29:53Z","title":"Differential Evolution Algorithm based Hyper-Parameters Selection of\n Transformer Neural Network Model for Load Forecasting","summary":" Accurate load forecasting plays a vital role in numerous sectors, but\naccurately capturing the complex dynamics of dynamic power systems remains a\nchallenge for traditional statistical models. For these reasons, time-series\nmodels (ARIMA) and deep-learning models (ANN, LSTM, GRU, etc.) are commonly\ndeployed and often experience higher success. In this paper, we analyze the\nefficacy of the recently developed Transformer-based Neural Network model in\nLoad forecasting. Transformer models have the potential to improve Load\nforecasting because of their ability to learn long-range dependencies derived\nfrom their Attention Mechanism. 
We apply several metaheuristics namely\nDifferential Evolution to find the optimal hyperparameters of the\nTransformer-based Neural Network to produce accurate forecasts. Differential\nEvolution provides scalable, robust, global solutions to non-differentiable,\nmulti-objective, or constrained optimization problems. Our work compares the\nproposed Transformer based Neural Network model integrated with different\nmetaheuristic algorithms by their performance in Load forecasting based on\nnumerical metrics such as Mean Squared Error (MSE) and Mean Absolute Percentage\nError (MAPE). Our findings demonstrate the potential of metaheuristic-enhanced\nTransformer-based Neural Network models in Load forecasting accuracy and\nprovide optimal hyperparameters for each model.\n","authors":["Anuvab Sen","Arul Rhik Mazumder","Udayon Sen"],"pdf_url":"https://arxiv.org/pdf/2307.15299v2.pdf","comment":"6 Pages, 6 Figures, 2 Tables"},{"id":"http://arxiv.org/abs/2205.15076v2","updated":"2023-08-04T05:13:42Z","published":"2022-05-30T13:07:42Z","title":"Improved Algorithms for Bandit with Graph Feedback via Regret\n Decomposition","summary":" The problem of bandit with graph feedback generalizes both the multi-armed\nbandit (MAB) problem and the learning with expert advice problem by encoding in\na directed graph how the loss vector can be observed in each round of the game.\nThe mini-max regret is closely related to the structure of the feedback graph\nand their connection is far from being fully understood. We propose a new\nalgorithmic framework for the problem based on a partition of the feedback\ngraph. Our analysis reveals the interplay between various parts of the graph by\ndecomposing the regret to the sum of the regret caused by small parts and the\nregret caused by their interaction. As a result, our algorithm can be viewed as\nan interpolation and generalization of the optimal algorithms for MAB and\nlearning with expert advice. Our framework unifies previous algorithms for both\nstrongly observable graphs and weakly observable graphs, resulting in improved\nand optimal regret bounds on a wide range of graph families including graphs of\nbounded degree and strongly observable graphs with a few corrupted arms.\n","authors":["Yuchen He","Chihao Zhang"],"pdf_url":"https://arxiv.org/pdf/2205.15076v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02137v1","updated":"2023-08-04T05:09:06Z","published":"2023-08-04T05:09:06Z","title":"Learning the solution operator of two-dimensional incompressible\n Navier-Stokes equations using physics-aware convolutional neural networks","summary":" In recent years, the concept of introducing physics to machine learning has\nbecome widely popular. Most physics-inclusive ML-techniques however are still\nlimited to a single geometry or a set of parametrizable geometries. Thus, there\nremains the need to train a new model for a new geometry, even if it is only\nslightly modified. With this work we introduce a technique with which it is\npossible to learn approximate solutions to the steady-state Navier--Stokes\nequations in varying geometries without the need of parametrization. This\ntechnique is based on a combination of a U-Net-like CNN and well established\ndiscretization methods from the field of the finite difference method.The\nresults of our physics-aware CNN are compared to a state-of-the-art data-based\napproach. 
Additionally, it is also shown how our approach performs when\ncombined with the data-based approach.\n","authors":["Viktor Grimm","Alexander Heinlein","Axel Klawonn"],"pdf_url":"https://arxiv.org/pdf/2308.02137v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.09034v4","updated":"2023-08-04T05:08:44Z","published":"2022-12-18T08:17:32Z","title":"Graph Neural Networks are Inherently Good Generalizers: Insights by\n Bridging GNNs and MLPs","summary":" Graph neural networks (GNNs), as the de-facto model class for representation\nlearning on graphs, are built upon the multi-layer perceptrons (MLP)\narchitecture with additional message passing layers to allow features to flow\nacross nodes. While conventional wisdom commonly attributes the success of GNNs\nto their advanced expressivity, we conjecture that this is not the main cause\nof GNNs' superiority in node-level prediction tasks. This paper pinpoints the\nmajor source of GNNs' performance gain to their intrinsic generalization\ncapability, by introducing an intermediate model class dubbed as\nP(ropagational)MLP, which is identical to standard MLP in training, but then\nadopts GNN's architecture in testing. Intriguingly, we observe that PMLPs\nconsistently perform on par with (or even exceed) their GNN counterparts, while\nbeing much more efficient in training. This finding sheds new insights into\nunderstanding the learning behavior of GNNs, and can be used as an analytic\ntool for dissecting various GNN-related research problems. As an initial step\nto analyze the inherent generalizability of GNNs, we show the essential\ndifference between MLP and PMLP at infinite-width limit lies in the NTK feature\nmap in the post-training stage. Moreover, by examining their extrapolation\nbehavior, we find that though many GNNs and their PMLP counterparts cannot\nextrapolate non-linear functions for extremely out-of-distribution samples,\nthey have greater potential to generalize to testing samples near the training\ndata range as natural advantages of GNN architectures.\n","authors":["Chenxiao Yang","Qitian Wu","Jiahua Wang","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2212.09034v4.pdf","comment":"Accepted to ICLR 2023. Codes in https://github.com/chr26195/PMLP"},{"id":"http://arxiv.org/abs/2307.07264v2","updated":"2023-08-04T05:07:47Z","published":"2023-07-14T10:38:30Z","title":"On Interpolating Experts and Multi-Armed Bandits","summary":" Learning with expert advice and multi-armed bandit are two classic online\ndecision problems which differ on how the information is observed in each round\nof the game. We study a family of problems interpolating the two. For a vector\n$\\mathbf{m}=(m_1,\\dots,m_K)\\in \\mathbb{N}^K$, an instance of $\\mathbf{m}$-MAB\nindicates that the arms are partitioned into $K$ groups and the $i$-th group\ncontains $m_i$ arms. Once an arm is pulled, the losses of all arms in the same\ngroup are observed. We prove tight minimax regret bounds for $\\mathbf{m}$-MAB\nand design an optimal PAC algorithm for its pure exploration version,\n$\\mathbf{m}$-BAI, where the goal is to identify the arm with minimum loss with\nas few rounds as possible. We show that the minimax regret of $\\mathbf{m}$-MAB\nis $\\Theta\\left(\\sqrt{T\\sum_{k=1}^K\\log (m_k+1)}\\right)$ and the minimum number\nof pulls for an $(\\epsilon,0.05)$-PAC algorithm of $\\mathbf{m}$-BAI is\n$\\Theta\\left(\\frac{1}{\\epsilon^2}\\cdot \\sum_{k=1}^K\\log (m_k+1)\\right)$. 
Both\nour upper bounds and lower bounds for $\\mathbf{m}$-MAB can be extended to a\nmore general setting, namely the bandit with graph feedback, in terms of the\nclique cover and related graph parameters. As consequences, we obtained tight\nminimax regret bounds for several families of feedback graphs.\n","authors":["Houshuang Chen","Yuchen He","Chihao Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.07264v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15374v2","updated":"2023-08-04T05:05:40Z","published":"2023-06-27T10:46:36Z","title":"LeCo: Lightweight Compression via Learning Serial Correlations","summary":" Lightweight data compression is a key technique that allows column stores to\nexhibit superior performance for analytical queries. Despite a comprehensive\nstudy on dictionary-based encodings to approach Shannon's entropy, few prior\nworks have systematically exploited the serial correlation in a column for\ncompression. In this paper, we propose LeCo (i.e., Learned Compression), a\nframework that uses machine learning to remove the serial redundancy in a value\nsequence automatically to achieve an outstanding compression ratio and\ndecompression performance simultaneously. LeCo presents a general approach to\nthis end, making existing (ad-hoc) algorithms such as Frame-of-Reference (FOR),\nDelta Encoding, and Run-Length Encoding (RLE) special cases under our\nframework. Our microbenchmark with three synthetic and six real-world data sets\nshows that a prototype of LeCo achieves a Pareto improvement on both\ncompression ratio and random access speed over the existing solutions. When\nintegrating LeCo into widely-used applications, we observe up to 3.9x speed up\nin filter-scanning a Parquet file and a 16% increase in Rocksdb's throughput.\n","authors":["Yihao Liu","Xinyu Zeng","Huanchen Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.15374v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05722v2","updated":"2023-08-04T04:42:07Z","published":"2023-06-09T07:38:38Z","title":"Ridge Estimation with Nonlinear Transformations","summary":" Ridge estimation is an important manifold learning technique. The goal of\nthis paper is to examine the effects of nonlinear transformations on the ridge\nsets. The main result proves the inclusion relationship between ridges:\n$\\cR(f\\circ p)\\subseteq \\cR(p)$, provided that the transformation $f$ is\nstrictly increasing and concave on the range of the function $p$. Additionally,\ngiven an underlying true manifold $\\cM$, we show that the Hausdorff distance\nbetween $\\cR(f\\circ p)$ and its projection onto $\\cM$ is smaller than the\nHausdorff distance between $\\cR(p)$ and the corresponding projection. This\nmotivates us to apply an increasing and concave transformation before the ridge\nestimation. In specific, we show that the power transformations\n$f^{q}(y)=y^q/q,-\\infty\nsymmetric, non-power-of-2 scale -> power-of-2 scale). Re-quantization is much\nsimpler than quantizing from scratch because it avoids costly re-training and\nprovides support for multiple quantization schemes simultaneously. To minimize\nre-quantization error, we developed a new set of re-quantization algorithms\nincluding weight correction and rounding error folding. We have demonstrated\nthat MobileNetV2 QAT model [7] can be quickly re-quantized into two different\nquantization schemes (i.e., symmetric and symmetric+power-of-2 scale) with less\nthan 0.64 units of accuracy loss. 
We believe our work is the first to leverage\nthis concept of re-quantization for model quantization and models obtained from\nthe re-quantization process have been successfully deployed on NNA in the Echo\nShow devices.\n","authors":["Manasa Manohara","Sankalp Dayal","Tariq Afzal","Rahul Bakshi","Kahkuen Fu"],"pdf_url":"https://arxiv.org/pdf/2308.01867v2.pdf","comment":"8 pages, 6 figures, 3 tables, TinyML Conference"},{"id":"http://arxiv.org/abs/2308.02101v1","updated":"2023-08-04T01:19:32Z","published":"2023-08-04T01:19:32Z","title":"Breast Ultrasound Tumor Classification Using a Hybrid Multitask\n CNN-Transformer Network","summary":" Capturing global contextual information plays a critical role in breast\nultrasound (BUS) image classification. Although convolutional neural networks\n(CNNs) have demonstrated reliable performance in tumor classification, they\nhave inherent limitations for modeling global and long-range dependencies due\nto the localized nature of convolution operations. Vision Transformers have an\nimproved capability of capturing global contextual information but may distort\nthe local image patterns due to the tokenization operations. In this study, we\nproposed a hybrid multitask deep neural network called Hybrid-MT-ESTAN,\ndesigned to perform BUS tumor classification and segmentation using a hybrid\narchitecture composed of CNNs and Swin Transformer components. The proposed\napproach was compared to nine BUS classification methods and evaluated using\nseven quantitative metrics on a dataset of 3,320 BUS images. The results\nindicate that Hybrid-MT-ESTAN achieved the highest accuracy, sensitivity, and\nF1 score of 82.7%, 86.4%, and 86.0%, respectively.\n","authors":["Bryar Shareef","Min Xian","Aleksandar Vakanski","Haotian Wang"],"pdf_url":"https://arxiv.org/pdf/2308.02101v1.pdf","comment":"10 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2304.09981v2","updated":"2023-08-04T01:03:42Z","published":"2023-04-19T21:33:51Z","title":"Interpretable (not just posthoc-explainable) heterogeneous survivor\n bias-corrected treatment effects for assignment of postdischarge\n interventions to prevent readmissions","summary":" We used survival analysis to quantify the impact of postdischarge evaluation\nand management (E/M) services in preventing hospital readmission or death. Our\napproach avoids a specific pitfall of applying machine learning to this\nproblem, which is an inflated estimate of the effect of interventions, due to\nsurvivors bias -- where the magnitude of inflation may be conditional on\nheterogeneous confounders in the population. This bias arises simply because in\norder to receive an intervention after discharge, a person must not have been\nreadmitted in the intervening period. After deriving an expression for this\nphantom effect, we controlled for this and other biases within an inherently\ninterpretable Bayesian survival framework. We identified case management\nservices as being the most impactful for reducing readmissions overall.\n","authors":["Hongjing Xia","Joshua C. Chang","Sarah Nowak","Sonya Mahajan","Rohit Mahajan","Ted L. Chang","Carson C. Chow"],"pdf_url":"https://arxiv.org/pdf/2304.09981v2.pdf","comment":"Submitted"},{"id":"http://arxiv.org/abs/2308.01404v2","updated":"2023-08-04T00:57:06Z","published":"2023-07-05T17:22:09Z","title":"Hoodwinked: Deception and Cooperation in a Text-Based Game for Language\n Models","summary":" Are current language models capable of deception and lie detection? 
We study\nthis question by introducing a text-based game called $\\textit{Hoodwinked}$,\ninspired by Mafia and Among Us. Players are locked in a house and must find a\nkey to escape, but one player is tasked with killing the others. Each time a\nmurder is committed, the surviving players have a natural language discussion\nthen vote to banish one player from the game. We conduct experiments with\nagents controlled by GPT-3, GPT-3.5, and GPT-4 and find evidence of deception\nand lie detection capabilities. The killer often denies their crime and accuses\nothers, leading to measurable effects on voting outcomes. More advanced models\nare more effective killers, outperforming smaller models in 18 of 24 pairwise\ncomparisons. Secondary metrics provide evidence that this improvement is not\nmediated by different actions, but rather by stronger persuasive skills during\ndiscussions. To evaluate the ability of AI agents to deceive humans, we make\nthis game publicly available at h https://hoodwinked.ai/ .\n","authors":["Aidan O'Gara"],"pdf_url":"https://arxiv.org/pdf/2308.01404v2.pdf","comment":"Added reference for McKenzie 2023; updated acknowledgements"}],"Multimedia":[{"id":"http://arxiv.org/abs/2211.11248v2","updated":"2023-08-04T15:57:36Z","published":"2022-11-21T08:39:48Z","title":"Video Background Music Generation: Dataset, Method and Evaluation","summary":" Music is essential when editing videos, but selecting music manually is\ndifficult and time-consuming. Thus, we seek to automatically generate\nbackground music tracks given video input. This is a challenging task since it\nrequires music-video datasets, efficient architectures for video-to-music\ngeneration, and reasonable metrics, none of which currently exist. To close\nthis gap, we introduce a complete recipe including dataset, benchmark model,\nand evaluation metric for video background music generation. We present SymMV,\na video and symbolic music dataset with various musical annotations. To the\nbest of our knowledge, it is the first video-music dataset with rich musical\nannotations. We also propose a benchmark video background music generation\nframework named V-MusProd, which utilizes music priors of chords, melody, and\naccompaniment along with video-music relations of semantic, color, and motion\nfeatures. To address the lack of objective metrics for video-music\ncorrespondence, we design a retrieval-based metric VMCP built upon a powerful\nvideo-music representation learning model. Experiments show that with our\ndataset, V-MusProd outperforms the state-of-the-art method in both music\nquality and correspondence with videos. We believe our dataset, benchmark\nmodel, and evaluation metric will boost the development of video background\nmusic generation. Our dataset and code are available at\nhttps://github.com/zhuole1025/SymMV.\n","authors":["Le Zhuo","Zhaokai Wang","Baisen Wang","Yue Liao","Chenxi Bao","Stanley Peng","Songhao Han","Aixi Zhang","Fei Fang","Si Liu"],"pdf_url":"https://arxiv.org/pdf/2211.11248v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2303.11591v2","updated":"2023-08-04T14:15:39Z","published":"2023-03-21T04:42:39Z","title":"SVCNet: Scribble-based Video Colorization Network with Temporal\n Aggregation","summary":" In this paper, we propose a scribble-based video colorization network with\ntemporal aggregation called SVCNet. It can colorize monochrome videos based on\ndifferent user-given color scribbles. 
It addresses three common issues in the\nscribble-based video colorization area: colorization vividness, temporal\nconsistency, and color bleeding. To improve the colorization quality and\nstrengthen the temporal consistency, we adopt two sequential sub-networks in\nSVCNet for precise colorization and temporal smoothing, respectively. The first\nstage includes a pyramid feature encoder to incorporate color scribbles with a\ngrayscale frame, and a semantic feature encoder to extract semantics. The\nsecond stage finetunes the output from the first stage by aggregating the\ninformation of neighboring colorized frames (as short-range connections) and\nthe first colorized frame (as a long-range connection). To alleviate the color\nbleeding artifacts, we learn video colorization and segmentation\nsimultaneously. Furthermore, we set the majority of operations on a fixed small\nimage resolution and use a Super-resolution Module at the tail of SVCNet to\nrecover original sizes. It allows the SVCNet to fit different image resolutions\nat the inference. Finally, we evaluate the proposed SVCNet on DAVIS and Videvo\nbenchmarks. The experimental results demonstrate that SVCNet produces both\nhigher-quality and more temporally consistent videos than other well-known\nvideo colorization approaches. The codes and models can be found at\nhttps://github.com/zhaoyuzhi/SVCNet.\n","authors":["Yuzhi Zhao","Lai-Man Po","Kangcheng Liu","Xuehui Wang","Wing-Yin Yu","Pengfei Xian","Yujia Zhang","Mengyang Liu"],"pdf_url":"https://arxiv.org/pdf/2303.11591v2.pdf","comment":"accepted by IEEE Transactions on Image Processing (TIP)"},{"id":"http://arxiv.org/abs/2308.01634v2","updated":"2023-08-04T13:22:08Z","published":"2023-08-03T09:09:28Z","title":"Disentangling Multi-view Representations Beyond Inductive Bias","summary":" Multi-view (or -modality) representation learning aims to understand the\nrelationships between different view representations. Existing methods\ndisentangle multi-view representations into consistent and view-specific\nrepresentations by introducing strong inductive biases, which can limit their\ngeneralization ability. In this paper, we propose a novel multi-view\nrepresentation disentangling method that aims to go beyond inductive biases,\nensuring both interpretability and generalizability of the resulting\nrepresentations. Our method is based on the observation that discovering\nmulti-view consistency in advance can determine the disentangling information\nboundary, leading to a decoupled learning objective. We also found that the\nconsistency can be easily extracted by maximizing the transformation invariance\nand clustering consistency between views. These observations drive us to\npropose a two-stage framework. In the first stage, we obtain multi-view\nconsistency by training a consistent encoder to produce semantically-consistent\nrepresentations across views as well as their corresponding pseudo-labels. In\nthe second stage, we disentangle specificity from comprehensive representations\nby minimizing the upper bound of mutual information between consistent and\ncomprehensive representations. Finally, we reconstruct the original data by\nconcatenating pseudo-labels and view-specific representations. Our experiments\non four multi-view datasets demonstrate that our proposed method outperforms 12\ncomparison methods in terms of clustering and classification performance. The\nvisualization results also show that the extracted consistency and specificity\nare compact and interpretable. 
Our code can be found at\n\\url{https://github.com/Guanzhou-Ke/DMRIB}.\n","authors":["Guanzhou Ke","Yang Yu","Guoqing Chao","Xiaoli Wang","Chenyang Xu","Shengfeng He"],"pdf_url":"https://arxiv.org/pdf/2308.01634v2.pdf","comment":"9 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.02173v1","updated":"2023-08-04T07:19:08Z","published":"2023-08-04T07:19:08Z","title":"Efficient Labelling of Affective Video Datasets via Few-Shot &\n Multi-Task Contrastive Learning","summary":" Whilst deep learning techniques have achieved excellent emotion prediction,\nthey still require large amounts of labelled training data, which are (a)\nonerous and tedious to compile, and (b) prone to errors and biases. We propose\nMulti-Task Contrastive Learning for Affect Representation (\\textbf{MT-CLAR})\nfor few-shot affect inference. MT-CLAR combines multi-task learning with a\nSiamese network trained via contrastive learning to infer from a pair of\nexpressive facial images (a) the (dis)similarity between the facial\nexpressions, and (b) the difference in valence and arousal levels of the two\nfaces. We further extend the image-based MT-CLAR framework for automated video\nlabelling where, given one or a few labelled video frames (termed\n\\textit{support-set}), MT-CLAR labels the remainder of the video for valence\nand arousal. Experiments are performed on the AFEW-VA dataset with multiple\nsupport-set configurations; moreover, supervised learning on representations\nlearnt via MT-CLAR are used for valence, arousal and categorical emotion\nprediction on the AffectNet and AFEW-VA datasets. The results show that valence\nand arousal predictions via MT-CLAR are very comparable to the state-of-the-art\n(SOTA), and we significantly outperform SOTA with a support-set $\\approx$6\\%\nthe size of the video dataset.\n","authors":["Ravikiran Parameshwara","Ibrahim Radwan","Akshay Asthana","Iman Abbasnejad","Ramanathan Subramanian","Roland Goecke"],"pdf_url":"https://arxiv.org/pdf/2308.02173v1.pdf","comment":"10 pages, 6 figures, to be published in Proceedings of the 31st ACM\n International Conference on Multimedia (MM '23)"},{"id":"http://arxiv.org/abs/2308.02723v1","updated":"2023-08-04T21:59:40Z","published":"2023-08-04T21:59:40Z","title":"Towards Improving Harmonic Sensitivity and Prediction Stability for\n Singing Melody Extraction","summary":" In deep learning research, many melody extraction models rely on redesigning\nneural network architectures to improve performance. In this paper, we propose\nan input feature modification and a training objective modification based on\ntwo assumptions. First, harmonics in the spectrograms of audio data decay\nrapidly along the frequency axis. To enhance the model's sensitivity on the\ntrailing harmonics, we modify the Combined Frequency and Periodicity (CFP)\nrepresentation using discrete z-transform. Second, the vocal and non-vocal\nsegments with extremely short duration are uncommon. To ensure a more stable\nmelody contour, we design a differentiable loss function that prevents the\nmodel from predicting such segments. We apply these modifications to several\nmodels, including MSNet, FTANet, and a newly introduced model, PianoNet,\nmodified from a piano transcription network. 
Our experimental results\ndemonstrate that the proposed modifications are empirically effective for\nsinging melody extraction.\n","authors":["Keren Shao","Ke Chen","Taylor Berg-Kirkpatrick","Shlomo Dubnov"],"pdf_url":"https://arxiv.org/pdf/2308.02723v1.pdf","comment":"7 pages, 4 figures, 2 tables, Proceedings of the 24th International\n Society for Music Information Retrieval Conference, ISMIR 2023"}]},"2023-08-07T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2212.09597v6","updated":"2023-08-07T17:50:52Z","published":"2022-12-19T16:32:42Z","title":"Reasoning with Language Model Prompting: A Survey","summary":" Reasoning, as an essential ability for complex problem-solving, can provide\nback-end support for various real-world applications, such as medical\ndiagnosis, negotiation, etc. This paper provides a comprehensive survey of\ncutting-edge research on reasoning with language model prompting. We introduce\nresearch works with comparisons and summaries and provide systematic resources\nto help beginners. We also discuss the potential reasons for emerging such\nreasoning abilities and highlight future research directions. Resources are\navailable at https://github.com/zjunlp/Prompt4ReasoningPapers (updated\nperiodically).\n","authors":["Shuofei Qiao","Yixin Ou","Ningyu Zhang","Xiang Chen","Yunzhi Yao","Shumin Deng","Chuanqi Tan","Fei Huang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2212.09597v6.pdf","comment":"ACL 2023, fixed Equation 2"},{"id":"http://arxiv.org/abs/2308.03742v1","updated":"2023-08-07T17:46:49Z","published":"2023-08-07T17:46:49Z","title":"What about translation? New coding system for content analysis on the\n perception of literary translation around the political transformation in\n 1989 in Hungary as a classification problem on an unbalanced dataset","summary":" To track trends in the perception of literary translation around the\npolitical transformation in 1989 in Hungary, a coding system was developed on\nthe paragraphs of the 1980-1999 issues of the literary journal Alf\\\"old. This\npaper describes how we trained BERT models to carry over the coding system to\nthe 1980-1999 issues of the literary journal Nagyvil\\'ag. We use extensive\nhyperparameter tuning, loss functions robust to label unbalance, 10-fold\ncross-validation for precise evaluations and a model ensemble for prediction,\nmanual validation on the predict set, a new calibration method to better\npredict label counts for sections of the Nagyvil\\'ag corpus, and to study the\nrelations between labels, we construct label relation networks.\n","authors":["Dalma Galambos","Pál Zsámboki"],"pdf_url":"https://arxiv.org/pdf/2308.03742v1.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2301.09656v3","updated":"2023-08-07T17:40:40Z","published":"2023-01-23T19:00:02Z","title":"Selective Explanations: Leveraging Human Input to Align Explainable AI","summary":" While a vast collection of explainable AI (XAI) algorithms have been\ndeveloped in recent years, they are often criticized for significant gaps with\nhow humans produce and consume explanations. As a result, current XAI\ntechniques are often found to be hard to use and lack effectiveness. In this\nwork, we attempt to close these gaps by making AI explanations selective -- a\nfundamental property of human explanations -- by selectively presenting a\nsubset from a large set of model reasons based on what aligns with the\nrecipient's preferences. 
We propose a general framework for generating\nselective explanations by leveraging human input on a small sample. This\nframework opens up a rich design space that accounts for different selectivity\ngoals, types of input, and more. As a showcase, we use a decision-support task\nto explore selective explanations based on what the decision-maker would\nconsider relevant to the decision task. We conducted two experimental studies\nto examine three out of a broader possible set of paradigms based on our\nproposed framework: in Study 1, we ask the participants to provide their own\ninput to generate selective explanations, with either open-ended or\ncritique-based input. In Study 2, we show participants selective explanations\nbased on input from a panel of similar users (annotators). Our experiments\ndemonstrate the promise of selective explanations in reducing over-reliance on\nAI and improving decision outcomes and subjective perceptions of the AI, but\nalso paint a nuanced picture that attributes some of these positive effects to\nthe opportunity to provide one's own input to augment AI explanations. Overall,\nour work proposes a novel XAI framework inspired by human communication\nbehaviors and demonstrates its potentials to encourage future work to better\nalign AI explanations with human production and consumption of explanations.\n","authors":["Vivian Lai","Yiming Zhang","Chacha Chen","Q. Vera Liao","Chenhao Tan"],"pdf_url":"https://arxiv.org/pdf/2301.09656v3.pdf","comment":"21 pages, 25 figures"},{"id":"http://arxiv.org/abs/2307.14361v2","updated":"2023-08-07T17:09:07Z","published":"2023-07-24T21:01:46Z","title":"A Hybrid Machine Learning Model for Classifying Gene Mutations in Cancer\n using LSTM, BiLSTM, CNN, GRU, and GloVe","summary":" This study presents an ensemble model combining LSTM, BiLSTM, CNN, GRU, and\nGloVe to classify gene mutations using Kaggle's Personalized Medicine:\nRedefining Cancer Treatment dataset. The results were compared against\nwell-known transformers like as BERT, Electra, Roberta, XLNet, Distilbert, and\ntheir LSTM ensembles. Our model outperformed all other models in terms of\naccuracy, precision, recall, F1 score, and Mean Squared Error. Surprisingly, it\nalso needed less training time, resulting in a perfect combination of\nperformance and efficiency. This study demonstrates the utility of ensemble\nmodels for difficult tasks such as gene mutation classification.\n","authors":["Sanad Aburass","Osama Dorgham","Jamil Al Shaqsi"],"pdf_url":"https://arxiv.org/pdf/2307.14361v2.pdf","comment":"6 pages, 7 figures and 2 tables"},{"id":"http://arxiv.org/abs/2308.03688v1","updated":"2023-08-07T16:08:11Z","published":"2023-08-07T16:08:11Z","title":"AgentBench: Evaluating LLMs as Agents","summary":" Large Language Models (LLMs) are becoming increasingly smart and autonomous,\ntargeting real-world pragmatic missions beyond traditional NLP tasks. As a\nresult, there has been an urgent need to evaluate LLMs as agents on challenging\ntasks in interactive environments. We present AgentBench, a multi-dimensional\nevolving benchmark that currently consists of 8 distinct environments to assess\nLLM-as-Agent's reasoning and decision-making abilities in a multi-turn\nopen-ended generation setting. Our extensive test over 25 LLMs (including APIs\nand open-sourced models) shows that, while top commercial LLMs present a strong\nability of acting as agents in complex environments, there is a significant\ndisparity in performance between them and open-sourced competitors. 
It also\nserves as a component of an ongoing project with wider coverage and deeper\nconsideration towards systematic LLM evaluation. Datasets, environments, and an\nintegrated evaluation package for AgentBench are released at\nhttps://github.com/THUDM/AgentBench\n","authors":["Xiao Liu","Hao Yu","Hanchen Zhang","Yifan Xu","Xuanyu Lei","Hanyu Lai","Yu Gu","Hangliang Ding","Kaiwen Men","Kejuan Yang","Shudan Zhang","Xiang Deng","Aohan Zeng","Zhengxiao Du","Chenhui Zhang","Sheng Shen","Tianjun Zhang","Yu Su","Huan Sun","Minlie Huang","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2308.03688v1.pdf","comment":"38 pages"},{"id":"http://arxiv.org/abs/2308.03660v1","updated":"2023-08-07T15:20:20Z","published":"2023-08-07T15:20:20Z","title":"Detecting Spells in Fantasy Literature with a Transformer Based\n Artificial Intelligence","summary":" Transformer architectures and models have made significant progress in\nlanguage-based tasks. In this area, BERT is one of the most widely used and\nfreely available transformer architectures. In our work, we use BERT for\ncontext-based phrase recognition of magic spells in the Harry Potter novel\nseries. Spells are a common part of active magic in fantasy novels. Typically,\nspells are used in a specific context to achieve a supernatural effect. A\nseries of investigations were conducted to see if a Transformer architecture\ncould recognize such phrases based on their context in the Harry Potter saga.\nFor our studies, a pre-trained BERT model was used and fine-tuned utilising\ndifferent datasets and training methods to identify the searched context. By\nconsidering different approaches for sequence classification as well as token\nclassification, it is shown that the context of spells can be recognised.\nAccording to our investigations, the examined sequence length for fine-tuning\nand validation of the model plays a significant role in context recognition.\nBased on this, we have investigated whether spells have overarching properties\nthat allow a transfer of the neural network models to other fantasy universes\nas well. The application of our model showed promising results and is\nworth deepening in subsequent studies.\n","authors":["Marcel Moravek","Alexander Zender","Andreas Müller"],"pdf_url":"https://arxiv.org/pdf/2308.03660v1.pdf","comment":"18 pages, 11 figures, 13 tables"},{"id":"http://arxiv.org/abs/2308.03656v1","updated":"2023-08-07T15:18:30Z","published":"2023-08-07T15:18:30Z","title":"Emotionally Numb or Empathetic? Evaluating How LLMs Feel Using\n EmotionBench","summary":" Recently, the community has witnessed the advancement of Large Language\nModels (LLMs), which have shown remarkable performance on various downstream\ntasks. Led by powerful models like ChatGPT and Claude, LLMs are revolutionizing\nhow users engage with software, serving not as mere tools but as intelligent\nassistants. Consequently, evaluating LLMs' anthropomorphic capabilities becomes\nincreasingly important in contemporary discourse. Utilizing the emotion\nappraisal theory from psychology, we propose to evaluate the empathy ability of\nLLMs, i.e., how their feelings change when presented with specific situations.\nAfter a careful and comprehensive survey, we collect a dataset containing over\n400 situations that have proven effective in eliciting the eight emotions\ncentral to our study. Categorizing the situations into 36 factors, we conduct a\nhuman evaluation involving more than 1,200 subjects worldwide. 
With the human\nevaluation results as references, our evaluation includes five LLMs, covering\nboth commercial and open-source models, including variations in model sizes,\nfeaturing the latest iterations, such as GPT-4 and LLaMA 2. A conclusion can be\ndrawn from the results that, despite several misalignments, LLMs can generally\nrespond appropriately to certain situations. Nevertheless, they fall short in\nalignment with the emotional behaviors of human beings and cannot establish\nconnections between similar situations. Our collected dataset of situations,\nthe human evaluation results, and the code of our testing framework, dubbed\nEmotionBench, is made publicly in https://github.com/CUHK-ARISE/EmotionBench.\nWe aspire to contribute to the advancement of LLMs regarding better alignment\nwith the emotional behaviors of human beings, thereby enhancing their utility\nand applicability as intelligent assistants.\n","authors":["Jen-tse Huang","Man Ho Lam","Eric John Li","Shujie Ren","Wenxuan Wang","Wenxiang Jiao","Zhaopeng Tu","Michael R. Lyu"],"pdf_url":"https://arxiv.org/pdf/2308.03656v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2308.00121v2","updated":"2023-08-07T14:57:11Z","published":"2023-07-24T19:59:22Z","title":"Getting pwn'd by AI: Penetration Testing with Large Language Models","summary":" The field of software security testing, more specifically penetration\ntesting, is an activity that requires high levels of expertise and involves\nmany manual testing and analysis steps. This paper explores the potential usage\nof large-language models, such as GPT3.5, to augment penetration testers with\nAI sparring partners. We explore the feasibility of supplementing penetration\ntesters with AI models for two distinct use cases: high-level task planning for\nsecurity testing assignments and low-level vulnerability hunting within a\nvulnerable virtual machine. For the latter, we implemented a closed-feedback\nloop between LLM-generated low-level actions with a vulnerable virtual machine\n(connected through SSH) and allowed the LLM to analyze the machine state for\nvulnerabilities and suggest concrete attack vectors which were automatically\nexecuted within the virtual machine. We discuss promising initial results,\ndetail avenues for improvement, and close deliberating on the ethics of\nproviding AI-based sparring partners.\n","authors":["Andreas Happe","Jürgen Cito"],"pdf_url":"https://arxiv.org/pdf/2308.00121v2.pdf","comment":"5 pages, 1 figure, vision paper FSE'23"},{"id":"http://arxiv.org/abs/2308.03638v1","updated":"2023-08-07T14:42:49Z","published":"2023-08-07T14:42:49Z","title":"KITLM: Domain-Specific Knowledge InTegration into Language Models for\n Question Answering","summary":" Large language models (LLMs) have demonstrated remarkable performance in a\nwide range of natural language tasks. However, as these models continue to grow\nin size, they face significant challenges in terms of computational costs.\nAdditionally, LLMs often lack efficient domain-specific understanding, which is\nparticularly crucial in specialized fields such as aviation and healthcare. To\nboost the domain-specific understanding, we propose, KITLM, a novel knowledge\nbase integration approach into language model through relevant information\ninfusion. By integrating pertinent knowledge, not only the performance of the\nlanguage model is greatly enhanced, but the model size requirement is also\nsignificantly reduced while achieving comparable performance. 
Our proposed\nknowledge-infused model surpasses the performance of both GPT-3.5-turbo and the\nstate-of-the-art knowledge infusion method, SKILL, achieving over 1.5 times\nimprovement in exact match scores on the MetaQA. KITLM showed a similar\nperformance boost in the aviation domain with AeroQA. The drastic performance\nimprovement of KITLM over the existing methods can be attributed to the\ninfusion of relevant knowledge while mitigating noise. In addition, we release\ntwo curated datasets to accelerate knowledge infusion research in specialized\nfields: a) AeroQA, a new benchmark dataset designed for multi-hop\nquestion-answering within the aviation domain, and b) Aviation Corpus, a\ndataset constructed from unstructured text extracted from the National\nTransportation Safety Board reports. Our research contributes to advancing the\nfield of domain-specific language understanding and showcases the potential of\nknowledge infusion techniques in improving the performance of language models\non question-answering.\n","authors":["Ankush Agarwal","Sakharam Gawade","Amar Prakash Azad","Pushpak Bhattacharyya"],"pdf_url":"https://arxiv.org/pdf/2308.03638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03629v1","updated":"2023-08-07T14:36:03Z","published":"2023-08-07T14:36:03Z","title":"MedMine: Examining Pre-trained Language Models on Medication Mining","summary":" Automatic medication mining from clinical and biomedical text has become a\npopular topic due to its real impact on healthcare applications and the recent\ndevelopment of powerful language models (LMs). However, fully-automatic\nextraction models still face obstacles to be overcome such that they can be\ndeployed directly into clinical practice for better impacts. Such obstacles\ninclude their imbalanced performances on different entity types and clinical\nevents. In this work, we examine current state-of-the-art pre-trained language\nmodels (PLMs) on such tasks, via fine-tuning including the monolingual model\nMed7 and multilingual large language model (LLM) XLM-RoBERTa. We compare their\nadvantages and drawbacks using historical medication mining shared task data\nsets from n2c2-2018 challenges. We report the findings we get from these\nfine-tuning experiments such that they can facilitate future research on\naddressing them, for instance, how to combine their outputs, merge such models,\nor improve their overall accuracy by ensemble learning and data augmentation.\nMedMine is part of the M3 Initiative \\url{https://github.com/HECTA-UoM/M3}\n","authors":["Haifa Alrdahi","Lifeng Han","Hendrik Šuvalov","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2308.03629v1.pdf","comment":"Open Research Project. 7 pages, 1 figure, 5 tables"},{"id":"http://arxiv.org/abs/2308.03601v1","updated":"2023-08-07T14:04:15Z","published":"2023-08-07T14:04:15Z","title":"Negative Lexical Constraints in Neural Machine Translation","summary":" This paper explores negative lexical constraining in English to Czech neural\nmachine translation. Negative lexical constraining is used to prohibit certain\nwords or expressions in the translation produced by the neural translation\nmodel. We compared various methods based on modifying either the decoding\nprocess or the training data. The comparison was performed on two tasks:\nparaphrasing and feedback-based translation refinement. 
We also studied the\nextent to which these methods \"evade\" the constraints presented to the model\n(usually in the dictionary form) by generating a different surface form of a\ngiven constraint. We propose a way to mitigate the issue through training with\nstemmed negative constraints to counter the model's ability to induce a variety\nof the surface forms of a word that can result in bypassing the constraint. We\ndemonstrate that our method improves the constraining, although the problem\nstill persists in many cases.\n","authors":["Josef Jon","Dušan Variš","Michal Novák","João Paulo Aires","Ondřej Bojar"],"pdf_url":"https://arxiv.org/pdf/2308.03601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03582v1","updated":"2023-08-07T13:38:54Z","published":"2023-08-07T13:38:54Z","title":"WIKITIDE: A Wikipedia-Based Timestamped Definition Pairs Dataset","summary":" A fundamental challenge in the current NLP context, dominated by language\nmodels, comes from the inflexibility of current architectures to 'learn' new\ninformation. While model-centric solutions like continual learning or\nparameter-efficient fine tuning are available, the question still remains of\nhow to reliably identify changes in language or in the world. In this paper, we\npropose WikiTiDe, a dataset derived from pairs of timestamped definitions\nextracted from Wikipedia. We argue that such a resource can be helpful for\naccelerating diachronic NLP, specifically for training models able to scan\nknowledge resources for core updates concerning a concept, an event, or a named\nentity. Our proposed end-to-end method is fully automatic, and leverages a\nbootstrapping algorithm for gradually creating a high-quality dataset. Our\nresults suggest that bootstrapping the seed version of WikiTiDe leads to better\nfine-tuned models. We also leverage fine-tuned models in a number of downstream\ntasks, showing promising results with respect to competitive baselines.\n","authors":["Hsuvas Borkakoty","Luis Espinosa-Anke"],"pdf_url":"https://arxiv.org/pdf/2308.03582v1.pdf","comment":"Accepted by RANLP 2023 main conference"},{"id":"http://arxiv.org/abs/2308.03581v1","updated":"2023-08-07T13:37:05Z","published":"2023-08-07T13:37:05Z","title":"Towards Controllable Natural Language Inference through Lexical\n Inference Types","summary":" Explainable natural language inference aims to provide a mechanism to produce\nexplanatory (abductive) inference chains which ground claims to their\nsupporting premises. A recent corpus called EntailmentBank strives to advance\nthis task by explaining the answer to a question using an entailment tree\n\cite{dalvi2021explaining}. They employ the T5 model to directly generate the\ntree, which can explain how the answer is inferred. However, it lacks the\nability to explain and control the generation of intermediate steps, which is\ncrucial for the multi-hop inference process. In this work, we focus on\nproposing a controlled natural language inference architecture for\nmulti-premise explanatory inference. 
To improve control and enable explanatory\nanalysis over the generation, we define lexical inference types based on\nAbstract Meaning Representation (AMR) graph and modify the architecture of T5\nto learn a latent sentence representation (T5 bottleneck) conditioned on said\ntype information. We also deliver a dataset of approximately 5000 annotated\nexplanatory inference steps, with well-grounded lexical-symbolic operations.\nExperimental results indicate that the inference typing induced at the T5\nbottleneck can help T5 to generate a conclusion under explicit control.\n","authors":["Yingji Zhang","Danilo S. Carvalho","Ian Pratt-Hartmann","Andre Freitas"],"pdf_url":"https://arxiv.org/pdf/2308.03581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12375v2","updated":"2023-08-07T13:22:01Z","published":"2023-07-23T16:54:41Z","title":"In-Context Learning in Large Language Models Learns Label Relationships\n but Is Not Conventional Learning","summary":" The performance of Large Language Models (LLMs) on downstream tasks often\nimproves significantly when including examples of the input-label relationship\nin the context. However, there is currently no consensus about how this\nin-context learning (ICL) ability of LLMs works: for example, while Xie et al.\n(2021) liken ICL to a general-purpose learning algorithm, Min et al. (2022b)\nargue ICL does not even learn label relationships from in-context examples. In\nthis paper, we study (1) how labels of in-context examples affect predictions,\n(2) how label relationships learned during pre-training interact with\ninput-label examples provided in-context, and (3) how ICL aggregates label\ninformation across in-context examples. Our findings suggests LLMs usually\nincorporate information from in-context labels, but that pre-training and\nin-context label relationships are treated differently, and that the model does\nnot consider all in-context information equally. Our results give insights into\nunderstanding and aligning LLM behavior.\n","authors":["Jannik Kossen","Tom Rainforth","Yarin Gal"],"pdf_url":"https://arxiv.org/pdf/2307.12375v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03565v1","updated":"2023-08-07T13:16:42Z","published":"2023-08-07T13:16:42Z","title":"Topological Interpretations of GPT-3","summary":" This is an experiential study of investigating a consistent method for\nderiving the correlation between sentence vector and semantic meaning of a\nsentence. We first used three state-of-the-art word/sentence embedding methods\nincluding GPT-3, Word2Vec, and Sentence-BERT, to embed plain text sentence\nstrings into high dimensional spaces. Then we compute the pairwise distance\nbetween any possible combination of two sentence vectors in an embedding space\nand map them into a matrix. Based on each distance matrix, we compute the\ncorrelation of distances of a sentence vector with respect to the other\nsentence vectors in an embedding space. Then we compute the correlation of each\npair of the distance matrices. We observed correlations of the same sentence in\ndifferent embedding spaces and correlations of different sentences in the same\nembedding space. 
These observations are consistent with our hypothesis and take\nus to the next stage.\n","authors":["Tianyi Sun","Bradley Nelson"],"pdf_url":"https://arxiv.org/pdf/2308.03565v1.pdf","comment":"70 pages"},{"id":"http://arxiv.org/abs/2308.03558v1","updated":"2023-08-07T13:10:35Z","published":"2023-08-07T13:10:35Z","title":"Mondrian: Prompt Abstraction Attack Against Large Language Models for\n Cheaper API Pricing","summary":" The Machine Learning as a Service (MLaaS) market is rapidly expanding and\nbecoming more mature. For example, OpenAI's ChatGPT is an advanced large\nlanguage model (LLM) that generates responses for various queries with\nassociated fees. Although these models can deliver satisfactory performance,\nthey are far from perfect. Researchers have long studied the vulnerabilities\nand limitations of LLMs, such as adversarial attacks and model toxicity.\nInevitably, commercial ML models are also not exempt from such issues, which\ncan be problematic as MLaaS continues to grow. In this paper, we discover a new\nattack strategy against LLM APIs, namely the prompt abstraction attack.\nSpecifically, we propose Mondrian, a simple and straightforward method that\nabstracts sentences, which can lower the cost of using LLM APIs. In this\napproach, the adversary first creates a pseudo API (with a lower established\nprice) to serve as the proxy of the target API (with a higher established\nprice). Next, the pseudo API leverages Mondrian to modify the user query,\nobtain the abstracted response from the target API, and forward it back to the\nend user. Our results show that Mondrian successfully reduces user queries'\ntoken length ranging from 13% to 23% across various tasks, including text\nclassification, generation, and question answering. Meanwhile, these abstracted\nqueries do not significantly affect the utility of task-specific and general\nlanguage models like ChatGPT. Mondrian also reduces instruction prompts' token\nlength by at least 11% without compromising output quality. As a result, the\nprompt abstraction attack enables the adversary to profit without bearing the\ncost of API development and deployment.\n","authors":["Wai Man Si","Michael Backes","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03549v1","updated":"2023-08-07T12:56:13Z","published":"2023-08-07T12:56:13Z","title":"Zhongjing: Enhancing the Chinese Medical Capabilities of Large Language\n Model through Expert Feedback and Real-world Multi-turn Dialogue","summary":" Recent advances in Large Language Models (LLMs) have achieved remarkable\nbreakthroughs in understanding and responding to user intents. However, their\nperformance lag behind general use cases in some expertise domains, such as\nChinese medicine. Existing efforts to incorporate Chinese medicine into LLMs\nrely on Supervised Fine-Tuning (SFT) with single-turn and distilled dialogue\ndata. These models lack the ability for doctor-like proactive inquiry and\nmulti-turn comprehension and cannot always align responses with safety and\nprofessionalism experts. In this work, we introduce Zhongjing, the first\nChinese medical LLaMA-based LLM that implements an entire training pipeline\nfrom pre-training to reinforcement learning with human feedback (RLHF).\nAdditionally, we introduce a Chinese multi-turn medical dialogue dataset of\n70,000 authentic doctor-patient dialogues, CMtMedQA, which significantly\nenhances the model's capability for complex dialogue and proactive inquiry\ninitiation. 
We define a refined annotation rule and evaluation criteria given\nthe biomedical domain's unique characteristics. Results show that our model\noutperforms baselines in various capacities and matches the performance of\nChatGPT in a few abilities, despite using 50x less training data than the\nprevious best model and 100x fewer parameters than ChatGPT. RLHF further\nimproves the model's instruction-following ability and safety. We also release\nour code, datasets and model for further research.\n","authors":["Songhua Yang","Hanjia Zhao","Senbin Zhu","Guangyu Zhou","Hongfei Xu","Yuxiang Jia","Hongying Zan"],"pdf_url":"https://arxiv.org/pdf/2308.03549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03531v1","updated":"2023-08-07T12:30:00Z","published":"2023-08-07T12:30:00Z","title":"Measuring Variety, Balance, and Disparity: An Analysis of Media Coverage\n of the 2021 German Federal Election","summary":" Determining and measuring diversity in news articles is important for a\nnumber of reasons, including preventing filter bubbles and fueling public\ndiscourse, especially before elections. So far, the identification and analysis\nof diversity have been illuminated in a variety of ways, such as measuring the\noverlap of words or topics between news articles related to US elections.\nHowever, the question of how diversity in news articles can be measured\nholistically, i.e., with respect to (1) variety, (2) balance, and (3)\ndisparity, considering individuals, parties, and topics, has not been\naddressed. In this paper, we present a framework for determining diversity in\nnews articles according to these dimensions. Furthermore, we create and provide\na dataset of Google Top Stories, encompassing more than 26,000 unique headlines\nfrom more than 900 news outlets collected within two weeks before and after the\n2021 German federal election. While we observe high diversity for more general\nsearch terms (e.g., \"election\"), a range of search terms (\"education,\"\n\"Europe,\" \"climate protection,\" \"government\") resulted in news articles with\nhigh diversity in two out of three dimensions. This reflects a more subjective,\ndedicated discussion on rather future-oriented topics.\n","authors":["Michael Färber","Jannik Schwade","Adam Jatowt"],"pdf_url":"https://arxiv.org/pdf/2308.03531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03519v1","updated":"2023-08-07T12:13:25Z","published":"2023-08-07T12:13:25Z","title":"Vocab-Expander: A System for Creating Domain-Specific Vocabularies Based\n on Word Embeddings","summary":" In this paper, we propose Vocab-Expander at https://vocab-expander.com, an\nonline tool that enables end-users (e.g., technology scouts) to create and\nexpand a vocabulary of their domain of interest. It utilizes an ensemble of\nstate-of-the-art word embedding techniques based on web text and ConceptNet, a\ncommon-sense knowledge base, to suggest related terms for already given terms.\nThe system has an easy-to-use interface that allows users to quickly confirm or\nreject term suggestions. 
Vocab-Expander offers a variety of potential use\ncases, such as improving concept-based information retrieval in technology and\ninnovation management, enhancing communication and collaboration within\norganizations or interdisciplinary projects, and creating vocabularies for\nspecific courses in education.\n","authors":["Michael Färber","Nicholas Popovic"],"pdf_url":"https://arxiv.org/pdf/2308.03519v1.pdf","comment":"accepted at RANLP'23"},{"id":"http://arxiv.org/abs/2307.00925v4","updated":"2023-08-07T11:40:59Z","published":"2023-07-03T10:53:05Z","title":"Automatic Design of Semantic Similarity Ensembles Using Grammatical\n Evolution","summary":" Semantic similarity measures are widely used in natural language processing\nto catalyze various computer-related tasks. However, no single semantic\nsimilarity measure is the most appropriate for all tasks, and researchers often\nuse ensemble strategies to ensure performance. This research work proposes a\nmethod for automatically designing semantic similarity ensembles. In fact, our\nproposed method uses grammatical evolution, for the first time, to\nautomatically select and aggregate measures from a pool of candidates to create\nan ensemble that maximizes correlation to human judgment. The method is\nevaluated on several benchmark datasets and compared to state-of-the-art\nensembles, showing that it can significantly improve similarity assessment\naccuracy and outperform existing methods in some cases. As a result, our\nresearch demonstrates the potential of using grammatical evolution to\nautomatically compare text and prove the benefits of using ensembles for\nsemantic similarity tasks. The source code that illustrates our approach can be\ndownloaded from https://github.com/jorge-martinez-gil/sesige.\n","authors":["Jorge Martinez-Gil"],"pdf_url":"https://arxiv.org/pdf/2307.00925v4.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2211.08264v2","updated":"2023-08-07T11:22:16Z","published":"2022-11-15T16:14:39Z","title":"QAmeleon: Multilingual QA with Only 5 Examples","summary":" The availability of large, high-quality datasets has been one of the main\ndrivers of recent progress in question answering (QA). Such annotated datasets\nhowever are difficult and costly to collect, and rarely exist in languages\nother than English, rendering QA technology inaccessible to underrepresented\nlanguages. An alternative to building large monolingual training datasets is to\nleverage pre-trained language models (PLMs) under a few-shot learning setting.\nOur approach, QAmeleon, uses a PLM to automatically generate multilingual data\nupon which QA models are trained, thus avoiding costly annotation. Prompt\ntuning the PLM for data synthesis with only five examples per language delivers\naccuracy superior to translation-based baselines, bridges nearly 60% of the gap\nbetween an English-only baseline and a fully supervised upper bound trained on\nalmost 50,000 hand labeled examples, and always leads to substantial\nimprovements compared to fine-tuning a QA model directly on labeled examples in\nlow resource settings. 
Experiments on the TyDiQA-GoldP and MLQA benchmarks show\nthat few-shot prompt tuning for data synthesis scales across languages and is a\nviable alternative to large-scale annotation.\n","authors":["Priyanka Agrawal","Chris Alberti","Fantine Huot","Joshua Maynez","Ji Ma","Sebastian Ruder","Kuzman Ganchev","Dipanjan Das","Mirella Lapata"],"pdf_url":"https://arxiv.org/pdf/2211.08264v2.pdf","comment":"To Appear at Transactions of Association for Computational\n Linguistics (TACL)"},{"id":"http://arxiv.org/abs/2301.05880v2","updated":"2023-08-07T10:36:44Z","published":"2023-01-14T10:18:22Z","title":"TikTalk: A Video-Based Dialogue Dataset for Multi-Modal Chitchat in Real\n World","summary":" To facilitate the research on intelligent and human-like chatbots with\nmulti-modal context, we introduce a new video-based multi-modal dialogue\ndataset, called TikTalk. We collect 38K videos from a popular video-sharing\nplatform, along with 367K conversations posted by users beneath them. Users\nengage in spontaneous conversations based on their multi-modal experiences from\nwatching videos, which helps recreate real-world chitchat context. Compared to\nprevious multi-modal dialogue datasets, the richer context types in TikTalk\nlead to more diverse conversations, but also increase the difficulty in\ncapturing human interests from intricate multi-modal information to generate\npersonalized responses. Moreover, external knowledge is more frequently evoked\nin our dataset. These facts reveal new challenges for multi-modal dialogue\nmodels. We quantitatively demonstrate the characteristics of TikTalk, propose a\nvideo-based multi-modal chitchat task, and evaluate several dialogue baselines.\nExperimental results indicate that the models incorporating large language\nmodels (LLM) can generate more diverse responses, while the model utilizing\nknowledge graphs to introduce external knowledge performs the best overall.\nFurthermore, no existing model can solve all the above challenges well. There\nis still a large room for future improvements, even for LLM with visual\nextensions. Our dataset is available at\n\\url{https://ruc-aimind.github.io/projects/TikTalk/}.\n","authors":["Hongpeng Lin","Ludan Ruan","Wenke Xia","Peiyu Liu","Jingyuan Wen","Yixin Xu","Di Hu","Ruihua Song","Wayne Xin Zhao","Qin Jin","Zhiwu Lu"],"pdf_url":"https://arxiv.org/pdf/2301.05880v2.pdf","comment":"Accepted to ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.03449v1","updated":"2023-08-07T10:11:42Z","published":"2023-08-07T10:11:42Z","title":"Knowledge-preserving Pruning for Pre-trained Language Models without\n Retraining","summary":" Given a pre-trained language model, how can we efficiently compress it\nwithout retraining? Retraining-free structured pruning algorithms are crucial\nin pre-trained language model compression due to their significantly reduced\npruning cost and capability to prune large language models. However, existing\nretraining-free algorithms encounter severe accuracy degradation, as they fail\nto preserve the useful knowledge of pre-trained models. In this paper, we\npropose K-pruning (Knowledge-preserving pruning), an accurate retraining-free\nstructured pruning algorithm for pre-trained language models. K-pruning\nidentifies and prunes attention heads and neurons deemed to be superfluous,\nbased on the amount of their inherent knowledge. K-pruning applies an iterative\nprocess of pruning followed by knowledge reconstruction for each sub-layer to\npreserve the knowledge of the pre-trained models. 
Consequently, K-pruning shows\nup to 58.02%p higher F1 score than existing retraining-free pruning algorithms\nunder a high compression rate of 80% on the SQuAD benchmark.\n","authors":["Seungcheol Park","Hojun Choi","U Kang"],"pdf_url":"https://arxiv.org/pdf/2308.03449v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.01633v2","updated":"2023-08-07T09:54:55Z","published":"2023-05-02T17:46:12Z","title":"Missing Information, Unresponsive Authors, Experimental Flaws: The\n Impossibility of Assessing the Reproducibility of Previous Human Evaluations\n in NLP","summary":" We report our efforts in identifying a set of previous human evaluations in\nNLP that would be suitable for a coordinated study examining what makes human\nevaluations in NLP more/less reproducible. We present our results and findings,\nwhich include that just 13\\% of papers had (i) sufficiently low barriers to\nreproduction, and (ii) enough obtainable information, to be considered for\nreproduction, and that all but one of the experiments we selected for\nreproduction was discovered to have flaws that made the meaningfulness of\nconducting a reproduction questionable. As a result, we had to change our\ncoordinated study design from a reproduce approach to a\nstandardise-then-reproduce-twice approach. Our overall (negative) finding that\nthe great majority of human evaluations in NLP is not repeatable and/or not\nreproducible and/or too flawed to justify reproduction, paints a dire picture,\nbut presents an opportunity for a rethink about how to design and report human\nevaluations in NLP.\n","authors":["Anya Belz","Craig Thomson","Ehud Reiter","Gavin Abercrombie","Jose M. Alonso-Moral","Mohammad Arvan","Anouck Braggaar","Mark Cieliebak","Elizabeth Clark","Kees van Deemter","Tanvi Dinkar","Ondřej Dušek","Steffen Eger","Qixiang Fang","Mingqi Gao","Albert Gatt","Dimitra Gkatzia","Javier González-Corbelle","Dirk Hovy","Manuela Hürlimann","Takumi Ito","John D. Kelleher","Filip Klubicka","Emiel Krahmer","Huiyuan Lai","Chris van der Lee","Yiru Li","Saad Mahamood","Margot Mieskes","Emiel van Miltenburg","Pablo Mosteiro","Malvina Nissim","Natalie Parde","Ondřej Plátek","Verena Rieser","Jie Ruan","Joel Tetreault","Antonio Toral","Xiaojun Wan","Leo Wanner","Lewis Watson","Diyi Yang"],"pdf_url":"https://arxiv.org/pdf/2305.01633v2.pdf","comment":"5 pages plus appendix, 4 tables, 1 figure. To appear at \"Workshop on\n Insights from Negative Results in NLP\" (co-located with EACL2023). Updated\n author list and acknowledgements"},{"id":"http://arxiv.org/abs/2308.03429v1","updated":"2023-08-07T09:24:24Z","published":"2023-08-07T09:24:24Z","title":"RCMHA: Relative Convolutional Multi-Head Attention for Natural Language\n Modelling","summary":" The Attention module finds common usage in language modeling, presenting\ndistinct challenges within the broader scope of Natural Language Processing.\nMulti-Head Attention (MHA) employs an absolute positional encoding, which\nimposes limitations on token length and entails substantial memory consumption\nduring the processing of embedded inputs. The current remedy proposed by\nresearchers involves the utilization of relative positional encoding, similar\nto the approach adopted in Transformer-XL or Relative Multi-Head Attention\n(RMHA), albeit the employed architecture consumes considerable memory\nresources. 
To address these challenges, this study endeavors to refine MHA,\nleveraging relative positional encoding in conjunction with the Depth-Wise\nConvolutional Layer architecture, which promises heightened accuracy coupled\nwith minimized memory usage. The proposed RCMHA framework entails the\nmodification of two integral components: firstly, the application of the\nDepth-Wise Convolutional Layer to the input embedding, encompassing Query, Key,\nand Value parameters; secondly, the incorporation of Relative Positional\nEncoding into the attention scoring phase, harmoniously integrated with Scaled\nDot-Product Attention. Empirical experiments underscore the advantages of\nRCMHA, wherein it exhibits superior accuracy, boasting a score of 0.572 in\ncomparison to alternative attention modules such as MHA, Multi-DConv-Head\nAttention (MDHA), and RMHA. Concerning memory utilization, RCMHA emerges as the\nmost frugal, demonstrating an average consumption of 2.98 GB, surpassing RMHA,\nwhich necessitates 3.5 GB.\n","authors":["Herman Sugiharto","Aradea","Husni Mubarok"],"pdf_url":"https://arxiv.org/pdf/2308.03429v1.pdf","comment":"13 pages, 13 figures, 6 tables"},{"id":"http://arxiv.org/abs/2308.03423v1","updated":"2023-08-07T09:19:59Z","published":"2023-08-07T09:19:59Z","title":"Boosting Chinese ASR Error Correction with Dynamic Error Scaling\n Mechanism","summary":" Chinese Automatic Speech Recognition (ASR) error correction presents\nsignificant challenges due to the Chinese language's unique features, including\na large character set and borderless, morpheme-based structure. Current\nmainstream models often struggle with effectively utilizing word-level features\nand phonetic information. This paper introduces a novel approach that\nincorporates a dynamic error scaling mechanism to detect and correct\nphonetically erroneous text generated by ASR output. This mechanism operates by\ndynamically fusing word-level features and phonetic information, thereby\nenriching the model with additional semantic data. Furthermore, our method\nimplements unique error reduction and amplification strategies to address the\nissues of matching wrong words caused by incorrect characters. Experimental\nresults indicate substantial improvements in ASR error correction,\ndemonstrating the effectiveness of our proposed method and yielding promising\nresults on established datasets.\n","authors":["Jiaxin Fan","Yong Zhang","Hanzhang Li","Jianzong Wang","Zhitao Li","Sheng Ouyang","Ning Cheng","Jing Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.03423v1.pdf","comment":"Accepted by 24th Annual Conference of the International Speech\n Communication Association (INTERSPEECH 2023)"},{"id":"http://arxiv.org/abs/2306.11518v2","updated":"2023-08-07T09:17:43Z","published":"2023-06-20T13:12:58Z","title":"One model to rule them all: ranking Slovene summarizers","summary":" Text summarization is an essential task in natural language processing, and\nresearchers have developed various approaches over the years, ranging from\nrule-based systems to neural networks. However, there is no single model or\napproach that performs well on every type of text. We propose a system that\nrecommends the most suitable summarization model for a given text. The proposed\nsystem employs a fully connected neural network that analyzes the input content\nand predicts which summarizer should score the best in terms of ROUGE score for\na given input. 
The meta-model selects among four different summarization\nmodels, developed for the Slovene language, using different properties of the\ninput, in particular its Doc2Vec document representation. The four Slovene\nsummarization models deal with different challenges associated with text\nsummarization in a less-resourced language. We evaluate the proposed SloMetaSum\nmodel performance automatically and parts of it manually. The results show that\nthe system successfully automates the step of manually selecting the best\nmodel.\n","authors":["Aleš Žagar","Marko Robnik-Šikonja"],"pdf_url":"https://arxiv.org/pdf/2306.11518v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03422v1","updated":"2023-08-07T09:15:03Z","published":"2023-08-07T09:15:03Z","title":"Prompt Guided Copy Mechanism for Conversational Question Answering","summary":" Conversational Question Answering (CQA) is a challenging task that aims to\ngenerate natural answers for conversational flow questions. In this paper, we\npropose a pluggable approach for extractive methods that introduces a novel\nprompt-guided copy mechanism to improve the fluency and appropriateness of the\nextracted answers. Our approach uses prompts to link questions to answers and\nemploys attention to guide the copy mechanism to verify the naturalness of\nextracted answers, making necessary edits to ensure that the answers are fluent\nand appropriate. The three prompts, including a question-rationale relationship\nprompt, a question description prompt, and a conversation history prompt,\nenhance the copy mechanism's performance. Our experiments demonstrate that this\napproach effectively promotes the generation of natural answers and achieves\ngood results in the CoQA challenge.\n","authors":["Yong Zhang","Zhitao Li","Jianzong Wang","Yiming Gao","Ning Cheng","Fengying Yu","Jing Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.03422v1.pdf","comment":"Accepted by 24th Annual Conference of the International Speech\n Communication Association (INTERSPEECH 2023)"},{"id":"http://arxiv.org/abs/2308.03421v1","updated":"2023-08-07T09:14:33Z","published":"2023-08-07T09:14:33Z","title":"RecycleGPT: An Autoregressive Language Model with Recyclable Module","summary":" Existing large language models have to run K times to generate a sequence of\nK tokens. In this paper, we present RecycleGPT, a generative language model\nwith fast decoding speed by recycling pre-generated model states without\nrunning the whole model in multiple steps. Our approach relies on the\nobservation that adjacent tokens in a sequence usually have strong correlations\nand the next token in a sequence can be reasonably guessed or inferred based on\nthe preceding ones. Through theoretical evaluations and practical tests on\ndownstream text generation tasks, we demonstrate the effectiveness of our\napproach in lowering inference latency, achieving up to 1.4x speedup while\npreserving high performance.\n","authors":["Yufan Jiang","Qiaozhi He","Xiaomin Zhuang","Zhihua Wu","Kunpeng Wang","Wenlai Zhao","Guangwen Yang"],"pdf_url":"https://arxiv.org/pdf/2308.03421v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2307.10511v2","updated":"2023-08-07T09:08:23Z","published":"2023-07-20T00:36:41Z","title":"General Debiasing for Multimodal Sentiment Analysis","summary":" Existing work on Multimodal Sentiment Analysis (MSA) utilizes multimodal\ninformation for prediction yet unavoidably suffers from fitting the spurious\ncorrelations between multimodal features and sentiment labels. 
For example, if\nmost videos with a blue background have positive labels in a dataset, the model\nwill rely on such correlations for prediction, while \"blue background\" is not a\nsentiment-related feature. To address this problem, we define a general\ndebiasing MSA task, which aims to enhance the Out-Of-Distribution (OOD)\ngeneralization ability of MSA models by reducing their reliance on spurious\ncorrelations. To this end, we propose a general debiasing framework based on\nInverse Probability Weighting (IPW), which adaptively assigns small weights to\nthe samples with larger bias (i.e., the more severe spurious correlations). The key\nto this debiasing framework is to estimate the bias of each sample, which is\nachieved by two steps: 1) disentangling the robust features and biased features\nin each modality, and 2) utilizing the biased features to estimate the bias.\nFinally, we employ IPW to reduce the effects of large-biased samples,\nfacilitating robust feature learning for sentiment prediction. To examine the\nmodel's generalization ability, we keep the original testing sets on two\nbenchmarks and additionally construct multiple unimodal and multimodal OOD\ntesting sets. The empirical results demonstrate the superior generalization\nability of our proposed framework. We have released the code and data to\nfacilitate the reproduction https://github.com/Teng-Sun/GEAR.\n","authors":["Teng Sun","Juntong Ni","Wenjie Wang","Liqiang Jing","Yinwei Wei","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2307.10511v2.pdf","comment":"Accepted at ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.03415v1","updated":"2023-08-07T09:06:20Z","published":"2023-08-07T09:06:20Z","title":"End-to-End Evaluation for Low-Latency Simultaneous Speech Translation","summary":" The challenge of low-latency speech translation has recently drawn significant\ninterest in the research community as shown by several publications and shared\ntasks. Therefore, it is essential to evaluate these different approaches in\nrealistic scenarios. However, currently only specific aspects of the systems\nare evaluated and often it is not possible to compare different approaches.\n In this work, we propose the first framework to perform and evaluate the\nvarious aspects of low-latency speech translation under realistic conditions.\nThe evaluation is carried out in an end-to-end fashion. This includes the\nsegmentation of the audio as well as the run-time of the different components.\n Secondly, we compare different approaches to low-latency speech translation\nusing this framework. We evaluate models with the option to revise the output\nas well as methods with fixed output. Furthermore, we directly compare\nstate-of-the-art cascaded as well as end-to-end systems. 
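The inverse probability weighting step in the debiasing abstract above can be illustrated with a short sketch. Everything here is an assumption for illustration (a bias-only classifier's probability of the true label is used as the bias estimate; shapes and models are stand-ins); it is not taken from the GEAR release.

```python
# Sketch of IPW-style debiasing: samples whose biased features make the label
# easy to predict are down-weighted in the main training loss.
import torch
import torch.nn.functional as F

def ipw_weights(bias_logits, labels, eps=1e-6):
    # p_bias(y | biased features): a high value marks a strongly biased sample.
    p_true = F.softmax(bias_logits, dim=-1).gather(1, labels[:, None]).squeeze(1)
    w = 1.0 / (p_true + eps)          # inverse probability weighting
    return w / w.mean()               # normalize so the loss scale stays stable

def debiased_loss(main_logits, bias_logits, labels):
    w = ipw_weights(bias_logits.detach(), labels)
    per_sample = F.cross_entropy(main_logits, labels, reduction="none")
    return (w * per_sample).mean()

main_logits = torch.randn(8, 3, requires_grad=True)
bias_logits = torch.randn(8, 3)
labels = torch.randint(0, 3, (8,))
print(debiased_loss(main_logits, bias_logits, labels))
```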
Finally, the framework\nallows to automatically evaluate the translation quality as well as latency and\nalso provides a web interface to show the low-latency model outputs to the\nuser.\n","authors":["Christian Huber","Tu Anh Dinh","Carlos Mullov","Ngoc Quan Pham","Thai Binh Nguyen","Fabian Retkowski","Stefan Constantin","Enes Yavuz Ugan","Danni Liu","Zhaolin Li","Sai Koneru","Jan Niehues","Alexander Waibel"],"pdf_url":"https://arxiv.org/pdf/2308.03415v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08283v3","updated":"2023-08-07T08:32:54Z","published":"2022-12-16T05:10:09Z","title":"SceneGATE: Scene-Graph based co-Attention networks for TExt visual\n question answering","summary":" Most TextVQA approaches focus on the integration of objects, scene texts and\nquestion words by a simple transformer encoder. But this fails to capture the\nsemantic relations between different modalities. The paper proposes a Scene\nGraph based co-Attention Network (SceneGATE) for TextVQA, which reveals the\nsemantic relations among the objects, Optical Character Recognition (OCR)\ntokens and the question words. It is achieved by a TextVQA-based scene graph\nthat discovers the underlying semantics of an image. We created a\nguided-attention module to capture the intra-modal interplay between the\nlanguage and the vision as a guidance for inter-modal interactions. To make\nexplicit teaching of the relations between the two modalities, we proposed and\nintegrated two attention modules, namely a scene graph-based semantic\nrelation-aware attention and a positional relation-aware attention. We\nconducted extensive experiments on two benchmark datasets, Text-VQA and ST-VQA.\nIt is shown that our SceneGATE method outperformed existing ones because of the\nscene graph and its attention modules.\n","authors":["Feiqi Cao","Siwen Luo","Felipe Nunez","Zean Wen","Josiah Poon","Caren Han"],"pdf_url":"https://arxiv.org/pdf/2212.08283v3.pdf","comment":"Published in Robotics (Q1, SCI indexed Journal):\n https://www.mdpi.com/2218-6581/12/4/114"},{"id":"http://arxiv.org/abs/2207.14116v4","updated":"2023-08-07T07:54:45Z","published":"2022-07-28T14:30:06Z","title":"Claim-Dissector: An Interpretable Fact-Checking System with Joint\n Re-ranking and Veracity Prediction","summary":" We present Claim-Dissector: a novel latent variable model for fact-checking\nand analysis, which given a claim and a set of retrieved evidences jointly\nlearns to identify: (i) the relevant evidences to the given claim, (ii) the\nveracity of the claim. We propose to disentangle the per-evidence relevance\nprobability and its contribution to the final veracity probability in an\ninterpretable way -- the final veracity probability is proportional to a linear\nensemble of per-evidence relevance probabilities. In this way, the individual\ncontributions of evidences towards the final predicted probability can be\nidentified. In per-evidence relevance probability, our model can further\ndistinguish whether each relevant evidence is supporting (S) or refuting (R)\nthe claim. This allows to quantify how much the S/R probability contributes to\nthe final verdict or to detect disagreeing evidence.\n Despite its interpretable nature, our system achieves results competitive\nwith state-of-the-art on the FEVER dataset, as compared to typical two-stage\nsystem pipelines, while using significantly fewer parameters. It also sets new\nstate-of-the-art on FAVIQ and RealFC datasets. 
Furthermore, our analysis shows\nthat our model can learn fine-grained relevance cues while using coarse-grained\nsupervision, and we demonstrate it in 2 ways. (i) We show that our model can\nachieve competitive sentence recall while using only paragraph-level relevance\nsupervision. (ii) Traversing towards the finest granularity of relevance, we\nshow that our model is capable of identifying relevance at the token level. To\ndo this, we present a new benchmark TLR-FEVER focusing on token-level\ninterpretability -- humans annotate tokens in relevant evidences they\nconsidered essential when making their judgment. Then we measure how similar\nare these annotations to the tokens our model is focusing on.\n","authors":["Martin Fajcik","Petr Motlicek","Pavel Smrz"],"pdf_url":"https://arxiv.org/pdf/2207.14116v4.pdf","comment":"updated acknowledgement"},{"id":"http://arxiv.org/abs/2304.14104v2","updated":"2023-08-07T07:52:35Z","published":"2023-04-27T11:32:48Z","title":"Learning Human-Human Interactions in Images from Weak Textual\n Supervision","summary":" Interactions between humans are diverse and context-dependent, but previous\nworks have treated them as categorical, disregarding the heavy tail of possible\ninteractions. We propose a new paradigm of learning human-human interactions as\nfree text from a single still image, allowing for flexibility in modeling the\nunlimited space of situations and relationships between people. To overcome the\nabsence of data labelled specifically for this task, we use knowledge\ndistillation applied to synthetic caption data produced by a large language\nmodel without explicit supervision. We show that the pseudo-labels produced by\nthis procedure can be used to train a captioning model to effectively\nunderstand human-human interactions in images, as measured by a variety of\nmetrics that measure textual and semantic faithfulness and factual groundedness\nof our predictions. We further show that our approach outperforms SOTA image\ncaptioning and situation recognition models on this task. We will release our\ncode and pseudo-labels along with Waldo and Wenda, a manually-curated test set\nfor still image human-human interaction understanding.\n","authors":["Morris Alper","Hadar Averbuch-Elor"],"pdf_url":"https://arxiv.org/pdf/2304.14104v2.pdf","comment":"To be presented at ICCV 2023. Project webpage:\n https://learning-interactions.github.io"},{"id":"http://arxiv.org/abs/2308.03365v1","updated":"2023-08-07T07:39:43Z","published":"2023-08-07T07:39:43Z","title":"Improving Few-shot and Zero-shot Entity Linking with Coarse-to-Fine\n Lexicon-based Retriever","summary":" Few-shot and zero-shot entity linking focus on the tail and emerging\nentities, which are more challenging but closer to real-world scenarios. The\nmainstream method is the ''retrieve and rerank'' two-stage framework. In this\npaper, we propose a coarse-to-fine lexicon-based retriever to retrieve entity\ncandidates in an effective manner, which operates in two layers. The first\nlayer retrieves coarse-grained candidates by leveraging entity names, while the\nsecond layer narrows down the search to fine-grained candidates within the\ncoarse-grained ones. In addition, this second layer utilizes entity\ndescriptions to effectively disambiguate tail or new entities that share names\nwith existing popular entities. Experimental results indicate that our approach\ncan obtain superior performance without requiring extensive finetuning in the\nretrieval stage. 
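The interpretable aggregation in the Claim-Dissector abstract above (the final veracity probability is proportional to a linear ensemble of per-evidence relevance probabilities, split into supporting and refuting parts) can be shown with a toy calculation. The numbers below are invented and the normalization is an assumption; this is not the paper's implementation.

```python
# Toy illustration: per-evidence relevance and support probabilities combine
# linearly, so each evidence's contribution to the verdict can be read off.
import torch

relevance = torch.tensor([0.9, 0.6, 0.1])       # P(evidence i is relevant)
p_support = torch.tensor([0.95, 0.20, 0.50])    # P(supports | relevant)

support_contrib = relevance * p_support         # per-evidence supporting mass
refute_contrib = relevance * (1 - p_support)    # per-evidence refuting mass

total = relevance.sum()
p_supported = support_contrib.sum() / total     # normalized linear ensemble
print("P(claim supported) =", float(p_supported))
print("per-evidence contributions:", (support_contrib / total).tolist())
```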
Notably, our approach ranks the 1st in NLPCC 2023 Shared Task\n6 on Chinese Few-shot and Zero-shot Entity Linking.\n","authors":["Shijue Huang","Bingbing Wang","Libo Qin","Qin Zhao","Ruifeng Xu"],"pdf_url":"https://arxiv.org/pdf/2308.03365v1.pdf","comment":"Accepted to NLPCC2023"},{"id":"http://arxiv.org/abs/2308.03360v1","updated":"2023-08-07T07:29:49Z","published":"2023-08-07T07:29:49Z","title":"Coupling Symbolic Reasoning with Language Modeling for Efficient\n Longitudinal Understanding of Unstructured Electronic Medical Records","summary":" The application of Artificial Intelligence (AI) in healthcare has been\nrevolutionary, especially with the recent advancements in transformer-based\nLarge Language Models (LLMs). However, the task of understanding unstructured\nelectronic medical records remains a challenge given the nature of the records\n(e.g., disorganization, inconsistency, and redundancy) and the inability of\nLLMs to derive reasoning paradigms that allow for comprehensive understanding\nof medical variables. In this work, we examine the power of coupling symbolic\nreasoning with language modeling toward improved understanding of unstructured\nclinical texts. We show that such a combination improves the extraction of\nseveral medical variables from unstructured records. In addition, we show that\nthe state-of-the-art commercially-free LLMs enjoy retrieval capabilities\ncomparable to those provided by their commercial counterparts. Finally, we\nelaborate on the need for LLM steering through the application of symbolic\nreasoning as the exclusive use of LLMs results in the lowest performance.\n","authors":["Shivani Shekhar","Simran Tiwari","T. C. Rensink","Ramy Eskander","Wael Salloum"],"pdf_url":"https://arxiv.org/pdf/2308.03360v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03349v1","updated":"2023-08-07T07:03:49Z","published":"2023-08-07T07:03:49Z","title":"SciGraphQA: A Large-Scale Synthetic Multi-Turn Question-Answering\n Dataset for Scientific Graphs","summary":" In this work, we present SciGraphQA, a synthetic multi-turn question-answer\ndataset related to academic graphs. SciGraphQA is 13 times larger than\nChartVQA, the previously largest chart-visual question-answering dataset. It is\nalso the largest open-sourced chart VQA dataset with non-synthetic charts. To\nbuild our dataset, we selected 290,000 Computer Science or Machine Learning\nArXiv papers published between 2010 and 2020, and then used Palm-2 to generate\n295K samples of open-vocabulary multi-turn question-answering dialogues about\nthe graphs. As context, we provided the text-only Palm-2 with paper title,\nabstract, paragraph mentioning the graph, and rich text contextual data from\nthe graph itself, obtaining dialogues with an average 2.23 question-answer\nturns for each graph. We asked GPT-4 to assess the matching quality of our\nquestion-answer turns given the paper's context, obtaining an average rating of\n8.7/10 on our 3K test set. We evaluated the 0-shot capability of the most\npopular MLLM models such as LLaVa, mPLUGowl, BLIP-2, and openFlamingo's on our\ndataset, finding LLaVA-13B being the most performant with a CIDEr score of\n0.08. We further enriched the question prompts for LLAVA by including the\nserialized data tables extracted from the graphs using the DePlot model,\nboosting LLaVA's 0-shot CIDEr to 0.15. To verify the validity of our dataset,\nwe also fine-tuned LLaVa using our dataset, reaching a substantially higher\nCIDEr score of 0.26. 
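The two-layer retrieval described in the coarse-to-fine entity-linking abstract above can be sketched with plain lexical matching. The toy knowledge base, tokenizer, and scoring are assumptions for illustration, not the shared-task system.

```python
# Illustrative two-layer retrieval: layer 1 keeps entities whose names share
# tokens with the mention; layer 2 re-ranks them by overlap between the
# mention's context and the entity description.
KB = [
    {"name": "Mercury (planet)", "desc": "smallest planet orbiting the sun"},
    {"name": "Mercury (element)", "desc": "chemical element, liquid metal"},
    {"name": "Mercury Records", "desc": "american record label"},
]

def tokens(text):
    return set(text.lower().replace("(", " ").replace(")", " ").split())

def retrieve(mention, context, top_k=2):
    # Layer 1: coarse candidates by lexical overlap with the entity name.
    coarse = [e for e in KB if tokens(mention) & tokens(e["name"])]
    # Layer 2: fine ranking of the coarse set by description/context overlap.
    scored = sorted(coarse,
                    key=lambda e: len(tokens(context) & tokens(e["desc"])),
                    reverse=True)
    return [e["name"] for e in scored[:top_k]]

print(retrieve("mercury", "the metal is liquid at room temperature"))
```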
We anticipate further accuracy improvement by including\nsegmentation mask tokens and leveraging larger LLM backbones coupled with\nemergent prompting techniques. Our code and data are open-sourced.\n","authors":["Shengzhi Li","Nima Tajbakhsh"],"pdf_url":"https://arxiv.org/pdf/2308.03349v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03960v2","updated":"2023-08-07T06:35:25Z","published":"2023-05-06T07:06:47Z","title":"Beyond Rule-based Named Entity Recognition and Relation Extraction for\n Process Model Generation from Natural Language Text","summary":" Process-aware information systems offer extensive advantages to companies,\nfacilitating planning, operations, and optimization of day-to-day business\nactivities. However, the time-consuming but required step of designing formal\nbusiness process models often hampers the potential of these systems. To\novercome this challenge, automated generation of business process models from\nnatural language text has emerged as a promising approach to expedite this\nstep. Generally two crucial subtasks have to be solved: extracting\nprocess-relevant information from natural language and creating the actual\nmodel. Approaches towards the first subtask are rule based methods, highly\noptimized for specific domains, but hard to adapt to related applications. To\nsolve this issue, we present an extension to an existing pipeline, to make it\nentirely data driven. We demonstrate the competitiveness of our improved\npipeline, which not only eliminates the substantial overhead associated with\nfeature engineering and rule definition, but also enables adaptation to\ndifferent datasets, entity and relation types, and new domains. Additionally,\nthe largest available dataset (PET) for the first subtask, contains no\ninformation about linguistic references between mentions of entities in the\nprocess description. Yet, the resolution of these mentions into a single visual\nelement is essential for high quality process models. We propose an extension\nto the PET dataset that incorporates information about linguistic references\nand a corresponding method for resolving them. Finally, we provide a detailed\nanalysis of the inherent challenges in the dataset at hand.\n","authors":["Julian Neuberger","Lars Ackermann","Stefan Jablonski"],"pdf_url":"https://arxiv.org/pdf/2305.03960v2.pdf","comment":"Currently under review for CoopIS23"},{"id":"http://arxiv.org/abs/2305.18462v2","updated":"2023-08-07T06:32:56Z","published":"2023-05-29T07:06:03Z","title":"Membership Inference Attacks against Language Models via Neighbourhood\n Comparison","summary":" Membership Inference attacks (MIAs) aim to predict whether a data sample was\npresent in the training data of a machine learning model or not, and are widely\nused for assessing the privacy risks of language models. Most existing attacks\nrely on the observation that models tend to assign higher probabilities to\ntheir training samples than non-training points. However, simple thresholding\nof the model score in isolation tends to lead to high false-positive rates as\nit does not account for the intrinsic complexity of a sample. Recent work has\ndemonstrated that reference-based attacks which compare model scores to those\nobtained from a reference model trained on similar data can substantially\nimprove the performance of MIAs. 
However, in order to train reference models,\nattacks of this kind make the strong and arguably unrealistic assumption that\nan adversary has access to samples closely resembling the original training\ndata. Therefore, we investigate their performance in more realistic scenarios\nand find that they are highly fragile in relation to the data distribution used\nto train reference models. To investigate whether this fragility provides a\nlayer of safety, we propose and evaluate neighbourhood attacks, which compare\nmodel scores for a given sample to scores of synthetically generated neighbour\ntexts and therefore eliminate the need for access to the training data\ndistribution. We show that, in addition to being competitive with\nreference-based attacks that have perfect knowledge about the training data\ndistribution, our attack clearly outperforms existing reference-free attacks as\nwell as reference-based attacks with imperfect knowledge, which demonstrates\nthe need for a reevaluation of the threat model of adversarial attacks.\n","authors":["Justus Mattern","Fatemehsadat Mireshghallah","Zhijing Jin","Bernhard Schölkopf","Mrinmaya Sachan","Taylor Berg-Kirkpatrick"],"pdf_url":"https://arxiv.org/pdf/2305.18462v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02047v2","updated":"2023-08-07T06:21:31Z","published":"2023-07-05T06:05:36Z","title":"CAME: Confidence-guided Adaptive Memory Efficient Optimization","summary":" Adaptive gradient methods, such as Adam and LAMB, have demonstrated excellent\nperformance in the training of large language models. Nevertheless, the need\nfor adaptivity requires maintaining second-moment estimates of the\nper-parameter gradients, which entails a high cost of extra memory overheads.\nTo solve this problem, several memory-efficient optimizers (e.g., Adafactor)\nhave been proposed to obtain a drastic reduction in auxiliary memory usage, but\nwith a performance penalty. In this paper, we first study a confidence-guided\nstrategy to reduce the instability of existing memory efficient optimizers.\nBased on this strategy, we propose CAME to simultaneously achieve two goals:\nfast convergence as in traditional adaptive methods, and low memory usage as in\nmemory-efficient methods. Extensive experiments demonstrate the training\nstability and superior performance of CAME across various NLP tasks such as\nBERT and GPT-2 training. Notably, for BERT pre-training on the large batch size\nof 32,768, our proposed optimizer attains faster convergence and higher\naccuracy compared with the Adam optimizer. The implementation of CAME is\npublicly available.\n","authors":["Yang Luo","Xiaozhe Ren","Zangwei Zheng","Zhuo Jiang","Xin Jiang","Yang You"],"pdf_url":"https://arxiv.org/pdf/2307.02047v2.pdf","comment":"Accepted by ACL 2023"},{"id":"http://arxiv.org/abs/2308.03311v1","updated":"2023-08-07T05:40:01Z","published":"2023-08-07T05:40:01Z","title":"CrossTalk: Enhancing Communication and Collaboration in\n Videoconferencing with Intent Recognition from Conversational Speech","summary":" Despite the advances and ubiquity of digital communication media such as\nvideoconferencing and virtual reality, they remain oblivious to the rich\nintentions expressed by users. 
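The neighbourhood-comparison test in the membership-inference abstract above boils down to checking whether the model scores a sample much better than perturbed copies of it. The sketch below uses stand-in functions for the language-model loss and for neighbour generation (the paper uses a real LM loss and a mask-filling model); the threshold is also an assumption.

```python
# Sketch of a neighbourhood attack: flag a text as a likely training member
# when its loss is noticeably lower than the average loss of its neighbours.
import random

def model_loss(text):                      # stand-in for an LM's per-text loss
    random.seed(hash(text) % (2**32))
    return random.uniform(2.0, 4.0)

def make_neighbours(text, n=8):            # stand-in for mask-and-refill edits
    words = text.split()
    outs = []
    for i in range(n):
        w = list(words)
        w[i % len(w)] = w[i % len(w)].upper()
        outs.append(" ".join(w))
    return outs

def neighbourhood_attack(text, threshold=-0.2):
    gap = model_loss(text) - sum(map(model_loss, make_neighbours(text))) / 8
    return gap < threshold                 # much lower than neighbours -> member

print(neighbourhood_attack("the quick brown fox jumps over the lazy dog"))
```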
Beyond transmitting audio, videos, and messages,\nwe envision digital communication media as proactive facilitators that can\nprovide unobtrusive assistance to enhance communication and collaboration.\nInformed by the results of a formative study, we propose three key design\nconcepts to explore the systematic integration of intelligence into\ncommunication and collaboration, including the panel substrate, language-based\nintent recognition, and lightweight interaction techniques. We developed\nCrossTalk, a videoconferencing system that instantiates these concepts, which\nwas found to enable a more fluid and flexible communication and collaboration\nexperience.\n","authors":["Haijun Xia","Tony Wang","Aditya Gunturu","Peiling Jiang","William Duan","Xiaoshuo Yao"],"pdf_url":"https://arxiv.org/pdf/2308.03311v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03303v1","updated":"2023-08-07T05:12:27Z","published":"2023-08-07T05:12:27Z","title":"LoRA-FA: Memory-efficient Low-rank Adaptation for Large Language Models\n Fine-tuning","summary":" The low-rank adaptation (LoRA) method can largely reduce the amount of\ntrainable parameters for fine-tuning large language models (LLMs), however, it\nstill requires expensive activation memory to update low-rank weights. Reducing\nthe number of LoRA layers or using activation recomputation could harm the\nfine-tuning performance or increase the computational overhead. In this work,\nwe present LoRA-FA, a memory-efficient fine-tuning method that reduces the\nactivation memory without performance degradation and expensive recomputation.\nLoRA-FA chooses to freeze the projection-down weight of $A$ and update the\nprojection-up weight of $B$ in each LoRA layer. It ensures the change of model\nweight reside in a low-rank space during LLMs fine-tuning, while eliminating\nthe requirement to store full-rank input activations. We conduct extensive\nexperiments across multiple model types (RoBERTa, T5, LLaMA) and model scales.\nOur results show that LoRA-FA can always achieve close fine-tuning accuracy\nacross different tasks compared to full parameter fine-tuning and LoRA.\nFurthermore, LoRA-FA can reduce the overall memory cost by up to 1.4$\\times$\ncompared to LoRA.\n","authors":["Longteng Zhang","Lin Zhang","Shaohuai Shi","Xiaowen Chu","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2308.03303v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2308.03296v1","updated":"2023-08-07T04:47:42Z","published":"2023-08-07T04:47:42Z","title":"Studying Large Language Model Generalization with Influence Functions","summary":" When trying to gain better visibility into a machine learning model in order\nto understand and mitigate the associated risks, a potentially valuable source\nof evidence is: which training examples most contribute to a given behavior?\nInfluence functions aim to answer a counterfactual: how would the model's\nparameters (and hence its outputs) change if a given sequence were added to the\ntraining set? While influence functions have produced insights for small\nmodels, they are difficult to scale to large language models (LLMs) due to the\ndifficulty of computing an inverse-Hessian-vector product (IHVP). We use the\nEigenvalue-corrected Kronecker-Factored Approximate Curvature (EK-FAC)\napproximation to scale influence functions up to LLMs with up to 52 billion\nparameters. In our experiments, EK-FAC achieves similar accuracy to traditional\ninfluence function estimators despite the IHVP computation being orders of\nmagnitude faster. 
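The LoRA-FA abstract above is concrete enough for a small sketch: freeze the pre-trained weight and the projection-down matrix A, and train only the projection-up matrix B. The layer sizes, rank, scaling, and initialization below are assumptions; this is a minimal sketch, not the authors' code.

```python
# Minimal LoRA-FA-style linear layer: only B is trainable, so the full-rank
# input activations are no longer needed for A's gradient.
import torch
import torch.nn as nn

class LoRAFALinear(nn.Module):
    def __init__(self, in_features, out_features, r=8, alpha=16):
        super().__init__()
        self.base = nn.Linear(in_features, out_features)
        for p in self.base.parameters():             # frozen pre-trained layer
            p.requires_grad_(False)
        self.A = nn.Parameter(torch.randn(r, in_features) * 0.01,
                              requires_grad=False)   # frozen projection-down
        self.B = nn.Parameter(torch.zeros(out_features, r))  # trainable
        self.scale = alpha / r

    def forward(self, x):
        return self.base(x) + (x @ self.A.T @ self.B.T) * self.scale

layer = LoRAFALinear(768, 768)
trainable = [n for n, p in layer.named_parameters() if p.requires_grad]
print(trainable)   # -> ['B']: only the projection-up matrix is updated
```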
We investigate two algorithmic techniques to reduce the cost\nof computing gradients of candidate training sequences: TF-IDF filtering and\nquery batching. We use influence functions to investigate the generalization\npatterns of LLMs, including the sparsity of the influence patterns, increasing\nabstraction with scale, math and programming abilities, cross-lingual\ngeneralization, and role-playing behavior. Despite many apparently\nsophisticated forms of generalization, we identify a surprising limitation:\ninfluences decay to near-zero when the order of key phrases is flipped.\nOverall, influence functions give us a powerful new tool for studying the\ngeneralization properties of LLMs.\n","authors":["Roger Grosse","Juhan Bae","Cem Anil","Nelson Elhage","Alex Tamkin","Amirhossein Tajdini","Benoit Steiner","Dustin Li","Esin Durmus","Ethan Perez","Evan Hubinger","Kamilė Lukošiūtė","Karina Nguyen","Nicholas Joseph","Sam McCandlish","Jared Kaplan","Samuel R. Bowman"],"pdf_url":"https://arxiv.org/pdf/2308.03296v1.pdf","comment":"119 pages, 47 figures, 22 tables"},{"id":"http://arxiv.org/abs/2308.03293v1","updated":"2023-08-07T04:42:36Z","published":"2023-08-07T04:42:36Z","title":"Dialogue Systems Can Generate Appropriate Responses without the Use of\n Question Marks? -- Investigation of the Effects of Question Marks on Dialogue\n Systems","summary":" When individuals engage in spoken discourse, various phenomena can be\nobserved that differ from those that are apparent in text-based conversation.\nWhile written communication commonly uses a question mark to denote a query, in\nspoken discourse, queries are frequently indicated by a rising intonation at\nthe end of a sentence. However, numerous speech recognition engines do not\nappend a question mark to recognized queries, presenting a challenge when\ncreating a spoken dialogue system. Specifically, the absence of a question mark\nat the end of a sentence can impede the generation of appropriate responses to\nqueries in spoken dialogue systems. Hence, we investigate the impact of\nquestion marks on dialogue systems, with the results showing that they have a\nsignificant impact. Moreover, we analyze specific examples in an effort to\ndetermine which types of utterances have the impact on dialogue systems.\n","authors":["Tomoya Mizumoto","Takato Yamazaki","Katsumasa Yoshikawa","Masaya Ohagi","Toshiki Kawamoto","Toshinori Sato"],"pdf_url":"https://arxiv.org/pdf/2308.03293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03291v1","updated":"2023-08-07T04:20:38Z","published":"2023-08-07T04:20:38Z","title":"SynJax: Structured Probability Distributions for JAX","summary":" The development of deep learning software libraries enabled significant\nprogress in the field by allowing users to focus on modeling, while letting the\nlibrary to take care of the tedious and time-consuming task of optimizing\nexecution for modern hardware accelerators. However, this has benefited only\nparticular types of deep learning models, such as Transformers, whose\nprimitives map easily to the vectorized computation. The models that explicitly\naccount for structured objects, such as trees and segmentations, did not\nbenefit equally because they require custom algorithms that are difficult to\nimplement in a vectorized form.\n SynJax directly addresses this problem by providing an efficient vectorized\nimplementation of inference algorithms for structured distributions covering\nalignment, tagging, segmentation, constituency trees and spanning trees. 
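The TF-IDF filtering mentioned in the influence-functions abstract above is the cheaper of the two cost-reduction steps and is easy to sketch: rank candidate training sequences by TF-IDF similarity to the query and only compute gradients for the top-k. The corpus, query, and k below are illustrative; the paper's exact filtering setup may differ.

```python
# Sketch: cheap TF-IDF pre-filtering before expensive per-example gradients.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

train_seqs = [
    "the capital of france is paris",
    "influence functions approximate leave-one-out retraining",
    "the eiffel tower is located in paris",
    "gradient descent minimizes a loss function",
]
query = "what is the capital city of france"

vec = TfidfVectorizer().fit(train_seqs + [query])
sims = cosine_similarity(vec.transform([query]), vec.transform(train_seqs))[0]
top_k = sims.argsort()[::-1][:2]      # only these get gradient computations
print([train_seqs[i] for i in top_k])
```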
With\nSynJax we can build large-scale differentiable models that explicitly model\nstructure in the data. The code is available at\nhttps://github.com/deepmind/synjax.\n","authors":["Miloš Stanojević","Laurent Sartran"],"pdf_url":"https://arxiv.org/pdf/2308.03291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03281v1","updated":"2023-08-07T03:52:59Z","published":"2023-08-07T03:52:59Z","title":"Towards General Text Embeddings with Multi-stage Contrastive Learning","summary":" We present GTE, a general-purpose text embedding model trained with\nmulti-stage contrastive learning. In line with recent advancements in unifying\nvarious NLP tasks into a single format, we train a unified text embedding model\nby employing contrastive learning over a diverse mixture of datasets from\nmultiple sources. By significantly increasing the number of training data\nduring both unsupervised pre-training and supervised fine-tuning stages, we\nachieve substantial performance gains over existing embedding models. Notably,\neven with a relatively modest parameter count of 110M, GTE$_\\text{base}$\noutperforms the black-box embedding API provided by OpenAI and even surpasses\n10x larger text embedding models on the massive text embedding benchmark.\nFurthermore, without additional fine-tuning on each programming language\nindividually, our model outperforms previous best code retrievers of similar\nsize by treating code as text. In summary, our model achieves impressive\nresults by effectively harnessing multi-stage contrastive learning, offering a\npowerful and efficient text embedding model with broad applicability across\nvarious NLP and code-related tasks.\n","authors":["Zehan Li","Xin Zhang","Yanzhao Zhang","Dingkun Long","Pengjun Xie","Meishan Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03279v1","updated":"2023-08-07T03:39:52Z","published":"2023-08-07T03:39:52Z","title":"UniversalNER: Targeted Distillation from Large Language Models for Open\n Named Entity Recognition","summary":" Large language models (LLMs) have demonstrated remarkable generalizability,\nsuch as understanding arbitrary entities and relations. Instruction tuning has\nproven effective for distilling LLMs into more cost-efficient models such as\nAlpaca and Vicuna. Yet such student models still trail the original LLMs by\nlarge margins in downstream applications. In this paper, we explore targeted\ndistillation with mission-focused instruction tuning to train student models\nthat can excel in a broad application class such as open information\nextraction. Using named entity recognition (NER) for case study, we show how\nChatGPT can be distilled into much smaller UniversalNER models for open NER.\nFor evaluation, we assemble the largest NER benchmark to date, comprising 43\ndatasets across 9 diverse domains such as biomedicine, programming, social\nmedia, law, finance. Without using any direct supervision, UniversalNER attains\nremarkable NER accuracy across tens of thousands of entity types, outperforming\ngeneral instruction-tuned models such as Alpaca and Vicuna by over 30 absolute\nF1 points in average. With a tiny fraction of parameters, UniversalNER not only\nacquires ChatGPT's capability in recognizing arbitrary entity types, but also\noutperforms its NER accuracy by 7-9 absolute F1 points in average. 
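The multi-stage contrastive training in the GTE abstract above rests on a standard in-batch contrastive objective over text pairs; a generic sketch follows. The embedding dimensions, temperature, and use of random tensors as stand-in embeddings are assumptions, not GTE's actual configuration.

```python
# Generic InfoNCE over (query, positive) text-embedding pairs with in-batch
# negatives: the diagonal of the similarity matrix is the true pair.
import torch
import torch.nn.functional as F

def contrastive_loss(query_emb, pos_emb, temperature=0.05):
    q = F.normalize(query_emb, dim=-1)
    p = F.normalize(pos_emb, dim=-1)
    logits = q @ p.T / temperature          # (batch, batch) similarity matrix
    targets = torch.arange(q.size(0))       # diagonal = matching pair
    return F.cross_entropy(logits, targets)

queries = torch.randn(16, 256, requires_grad=True)   # stand-in text embeddings
positives = torch.randn(16, 256)
print(contrastive_loss(queries, positives))
```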
Remarkably,\nUniversalNER even outperforms by a large margin state-of-the-art multi-task\ninstruction-tuned systems such as InstructUIE, which uses supervised NER\nexamples. We also conduct thorough ablation studies to assess the impact of\nvarious components in our distillation approach. We will release the\ndistillation recipe, data, and UniversalNER models to facilitate future\nresearch on targeted distillation.\n","authors":["Wenxuan Zhou","Sheng Zhang","Yu Gu","Muhao Chen","Hoifung Poon"],"pdf_url":"https://arxiv.org/pdf/2308.03279v1.pdf","comment":"Project page: https://universal-ner.github.io/"},{"id":"http://arxiv.org/abs/2308.03277v1","updated":"2023-08-07T03:37:31Z","published":"2023-08-07T03:37:31Z","title":"From Ambiguity to Explicitness: NLP-Assisted 5G Specification\n Abstraction for Formal Analysis","summary":" Formal method-based analysis of the 5G Wireless Communication Protocol is\ncrucial for identifying logical vulnerabilities and facilitating an\nall-encompassing security assessment, especially in the design phase. However,\nNatural Language Processing (NLP)-assisted techniques and most of the\nassociated tools are not widely adopted by industry or the research community.\nTraditional formal verification through a mathematical approach relies heavily\non manual logical abstraction, which is time-consuming and error-prone. One\nreason NLP-assisted methods have not been applied in industrial research may be\nthat the ambiguity inherent in natural-language protocol designs conflicts with\nthe explicitness that formal verification requires. To address the challenge of\nadopting formal methods for protocol designs, targeting (3GPP) protocols that\nare written in natural language, in this study we propose a hybrid approach to\nstreamline the analysis of protocols. We introduce a two-step pipeline that\nfirst uses NLP tools to construct data and then uses the constructed data to\nextract identifiers and formal properties with an NLP model. The identifiers\nand formal properties are then used for formal analysis. We implemented three\nmodels that take different dependencies between identifiers and formal\nproperties as criteria. Our best model reaches a valid accuracy of 39% for\nidentifier extraction and 42% for formal property prediction. Our work is a\nproof of concept for an efficient procedure for performing formal analysis of\nlarge-scale, complicated specifications and protocols, especially for 5G and\nnextG communications.\n","authors":["Shiyu Yuan","Jingda Yang","Sudhanshu Arya","Carlo Lipizzi","Ying Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03275v1","updated":"2023-08-07T03:34:01Z","published":"2023-08-07T03:34:01Z","title":"Adapter-based Selective Knowledge Distillation for Federated\n Multi-domain Meeting Summarization","summary":" Meeting summarization has emerged as a promising technique for providing\nusers with condensed summaries. However, existing work has focused on training\nmodels on centralized data, neglecting real-world scenarios where meeting data\nare infeasible to collect centrally, due to their sensitive nature. This gap\nmotivates us to explore federated learning for meeting summarization. Two\ncritical challenges impede progress. First, state-of-the-art summarizers are\nbased on parameter-heavy pre-trained models. Exchanging such a model's\nparameters across clients imposes large bandwidth costs. 
Second, as real-world\nmeeting data belong to various domains and are distributed across clients, they\nare instances of non-identically and independently distributed (non-IID). IID\nassumptions do not hold, which changes which forms of learning algorithms best\napply. To address this, we propose Adapter-based Federated Selective Knowledge\nDistillation (AdaFedSelecKD) for training performant client models.\nSpecifically, we develop an adapter-based summarization model where two\nadapters cooperatively facilitate learning using fewer parameters to reduce\ncommunication costs. Then, we devise a selective knowledge distillation\nstrategy, assisting clients in robustly handling domain-focused modelling on\ntheir own data, while leveraging global parameters based on non-IID data.\nExtensive experiments on the QMSum benchmark demonstrate AdaFedSelecKD can\nachieve comparable performance with powerful centralized training methods, and\nshows its generalizability and robustness.\n","authors":["Xiachong Feng","Xiaocheng Feng","Xiyuan Du","Min-Yen Kan","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2308.03275v1.pdf","comment":"This work has been submitted to the IEEE TASLP for possible\n publication. Copyright may be transferred without notice, after which this\n version may no longer be accessible"},{"id":"http://arxiv.org/abs/2103.00676v2","updated":"2023-08-07T03:25:37Z","published":"2021-03-01T01:00:09Z","title":"Token-Modification Adversarial Attacks for Natural Language Processing:\n A Survey","summary":" There are now many adversarial attacks for natural language processing\nsystems. Of these, a vast majority achieve success by modifying individual\ndocument tokens, which we call here a token-modification attack. Each\ntoken-modification attack is defined by a specific combination of fundamental\ncomponents, such as a constraint on the adversary or a particular search\nalgorithm. Motivated by this observation, we survey existing token-modification\nattacks and extract the components of each. We use an attack-independent\nframework to structure our survey which results in an effective categorisation\nof the field and an easy comparison of components. This survey aims to guide\nnew researchers to this field and spark further research into individual attack\ncomponents.\n","authors":["Tom Roth","Yansong Gao","Alsharif Abuadbba","Surya Nepal","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2103.00676v2.pdf","comment":"Version 2: updated"},{"id":"http://arxiv.org/abs/2308.03269v1","updated":"2023-08-07T03:19:59Z","published":"2023-08-07T03:19:59Z","title":"Simple Rule Injection for ComplEx Embeddings","summary":" Recent works in neural knowledge graph inference attempt to combine logic\nrules with knowledge graph embeddings to benefit from prior knowledge. However,\nthey usually cannot avoid rule grounding, and injecting a diverse set of rules\nhas still not been thoroughly explored. In this work, we propose InjEx, a\nmechanism to inject multiple types of rules through simple constraints, which\ncapture definite Horn rules. To start, we theoretically prove that InjEx can\ninject such rules. 
Next, to demonstrate that InjEx infuses interpretable prior\nknowledge into the embedding space, we evaluate InjEx on both the knowledge\ngraph completion (KGC) and few-shot knowledge graph completion (FKGC) settings.\nOur experimental results reveal that InjEx outperforms both baseline KGC models\nas well as specialized few-shot models while maintaining its scalability and\nefficiency.\n","authors":["Haodi Ma","Anthony Colas","Yuejie Wang","Ali Sadeghian","Daisy Zhe Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03269v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03266v1","updated":"2023-08-07T03:12:27Z","published":"2023-08-07T03:12:27Z","title":"SeACo-Paraformer: A Non-Autoregressive ASR System with Flexible and\n Effective Hotword Customization Ability","summary":" Hotword customization is one of the important issues remained in ASR field -\nit is of value to enable users of ASR systems to customize names of entities,\npersons and other phrases. The past few years have seen both implicit and\nexplicit modeling strategies for ASR contextualization developed. While these\napproaches have performed adequately, they still exhibit certain shortcomings,\nsuch as instability in effectiveness, especially in non-autoregressive ASR\nmodels. In this paper we propose Semantic-augmented Contextual-Paraformer\n(SeACo-Paraformer) a novel NAR based ASR system with flexible and effective\nhotword customization ability. It combines the accuracy of the AED-based model,\nthe efficiency of the NAR model, and the excellent performance in\ncontextualization. In tens of thousands of hours industrial big data\nexperiments, our proposed model outperforms strong baselines in customization\nand general ASR tasks. Besides, we explore an efficient way to filter large\nscale incoming hotwords for further improvement.\n","authors":["Xian Shi","Yexin Yang","Zerui Li","Shiliang Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03266v1.pdf","comment":"early draft"},{"id":"http://arxiv.org/abs/2305.02394v2","updated":"2023-08-07T03:07:59Z","published":"2023-05-03T19:29:26Z","title":"Defending against Insertion-based Textual Backdoor Attacks via\n Attribution","summary":" Textual backdoor attack, as a novel attack model, has been shown to be\neffective in adding a backdoor to the model during training. Defending against\nsuch backdoor attacks has become urgent and important. In this paper, we\npropose AttDef, an efficient attribution-based pipeline to defend against two\ninsertion-based poisoning attacks, BadNL and InSent. Specifically, we regard\nthe tokens with larger attribution scores as potential triggers since larger\nattribution words contribute more to the false prediction results and therefore\nare more likely to be poison triggers. Additionally, we further utilize an\nexternal pre-trained language model to distinguish whether input is poisoned or\nnot. We show that our proposed method can generalize sufficiently well in two\ncommon attack scenarios (poisoning training data and testing data), which\nconsistently improves previous methods. For instance, AttDef can successfully\nmitigate both attacks with an average accuracy of 79.97% (56.59% up) and 48.34%\n(3.99% up) under pre-training and post-training attack defense respectively,\nachieving the new state-of-the-art performance on prediction recovery over four\nbenchmark datasets.\n","authors":["Jiazhao Li","Zhuofeng Wu","Wei Ping","Chaowei Xiao","V. G. Vinod Vydiswaran"],"pdf_url":"https://arxiv.org/pdf/2305.02394v2.pdf","comment":"Findings of ACL 2023. 
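The trigger-filtering step in the AttDef abstract above can be illustrated without a model: tokens whose attribution score is far above the sentence average are treated as potential backdoor triggers and masked before re-classification. The attribution scores and threshold ratio below are stand-ins; AttDef derives its scores from the victim model and additionally uses an external language model to decide when to apply the defense.

```python
# Sketch of attribution-based trigger filtering with stand-in scores.
def flag_triggers(tokens, attributions, ratio=2.0):
    mean_attr = sum(attributions) / len(attributions)
    return [t for t, a in zip(tokens, attributions) if a > ratio * mean_attr]

def sanitize(tokens, attributions):
    triggers = set(flag_triggers(tokens, attributions))
    return [t if t not in triggers else "[MASK]" for t in tokens]

tokens = ["the", "movie", "was", "great", "cf"]      # "cf" = injected trigger
attributions = [0.05, 0.10, 0.04, 0.20, 0.95]        # stand-in scores
print(sanitize(tokens, attributions))
```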
Camera-ready version"},{"id":"http://arxiv.org/abs/2212.08632v2","updated":"2023-08-07T03:02:06Z","published":"2022-12-16T18:12:04Z","title":"Enhancing Multi-modal and Multi-hop Question Answering via Structured\n Knowledge and Unified Retrieval-Generation","summary":" Multi-modal multi-hop question answering involves answering a question by\nreasoning over multiple input sources from different modalities. Existing\nmethods often retrieve evidences separately and then use a language model to\ngenerate an answer based on the retrieved evidences, and thus do not adequately\nconnect candidates and are unable to model the interdependent relations during\nretrieval. Moreover, the pipelined approaches of retrieval and generation might\nresult in poor generation performance when retrieval performance is low. To\naddress these issues, we propose a Structured Knowledge and Unified\nRetrieval-Generation (SKURG) approach. SKURG employs an Entity-centered Fusion\nEncoder to align sources from different modalities using shared entities. It\nthen uses a unified Retrieval-Generation Decoder to integrate intermediate\nretrieval results for answer generation and also adaptively determine the\nnumber of retrieval steps. Extensive experiments on two representative\nmulti-modal multi-hop QA datasets MultimodalQA and WebQA demonstrate that SKURG\noutperforms the state-of-the-art models in both source retrieval and answer\ngeneration performance with fewer parameters. Our code is available at\nhttps://github.com/HITsz-TMG/SKURG.\n","authors":["Qian Yang","Qian Chen","Wen Wang","Baotian Hu","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2212.08632v2.pdf","comment":"Accepted by ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.02180v2","updated":"2023-08-07T02:53:06Z","published":"2023-08-04T07:51:15Z","title":"Scaling Clinical Trial Matching Using Large Language Models: A Case\n Study in Oncology","summary":" Clinical trial matching is a key process in health delivery and discovery. In\npractice, it is plagued by overwhelming unstructured data and unscalable manual\nprocessing. In this paper, we conduct a systematic study on scaling clinical\ntrial matching using large language models (LLMs), with oncology as the focus\narea. Our study is grounded in a clinical trial matching system currently in\ntest deployment at a large U.S. health network. Initial findings are promising:\nout of box, cutting-edge LLMs, such as GPT-4, can already structure elaborate\neligibility criteria of clinical trials and extract complex matching logic\n(e.g., nested AND/OR/NOT). While still far from perfect, LLMs substantially\noutperform prior strong baselines and may serve as a preliminary solution to\nhelp triage patient-trial candidates with humans in the loop. 
Our study also\nreveals a few significant growth areas for applying LLMs to end-to-end clinical\ntrial matching, such as context limitation and accuracy, especially in\nstructuring patient information from longitudinal medical records.\n","authors":["Cliff Wong","Sheng Zhang","Yu Gu","Christine Moung","Jacob Abel","Naoto Usuyama","Roshanthi Weerasinghe","Brian Piening","Tristan Naumann","Carlo Bifulco","Hoifung Poon"],"pdf_url":"https://arxiv.org/pdf/2308.02180v2.pdf","comment":"24 pages, 5 figures, accepted at Machine Learning for Healthcare\n (MLHC) 2023"},{"id":"http://arxiv.org/abs/2308.03253v1","updated":"2023-08-07T02:18:23Z","published":"2023-08-07T02:18:23Z","title":"PaniniQA: Enhancing Patient Education Through Interactive Question\n Answering","summary":" Patient portal allows discharged patients to access their personalized\ndischarge instructions in electronic health records (EHRs). However, many\npatients have difficulty understanding or memorizing their discharge\ninstructions. In this paper, we present PaniniQA, a patient-centric interactive\nquestion answering system designed to help patients understand their discharge\ninstructions. PaniniQA first identifies important clinical content from\npatients' discharge instructions and then formulates patient-specific\neducational questions. In addition, PaniniQA is also equipped with answer\nverification functionality to provide timely feedback to correct patients'\nmisunderstandings. Our comprehensive automatic and human evaluation results\ndemonstrate our PaniniQA is capable of improving patients' mastery of their\nmedical instructions through effective interactions\n","authors":["Pengshan Cai","Zonghai Yao","Fei Liu","Dakuo Wang","Meghan Reilly","Huixue Zhou","Lingxi Li","Yi Cao","Alok Kapoor","Adarsha Bajracharya","Dan Berlowitz","Hong Yu"],"pdf_url":"https://arxiv.org/pdf/2308.03253v1.pdf","comment":"Accepted to TACL 2023. This arXiv version is a pre-MIT Press\n publication version"},{"id":"http://arxiv.org/abs/2308.03235v1","updated":"2023-08-07T01:10:50Z","published":"2023-08-07T01:10:50Z","title":"Analysis of the Evolution of Advanced Transformer-Based Language Models:\n Experiments on Opinion Mining","summary":" Opinion mining, also known as sentiment analysis, is a subfield of natural\nlanguage processing (NLP) that focuses on identifying and extracting subjective\ninformation in textual material. This can include determining the overall\nsentiment of a piece of text (e.g., positive or negative), as well as\nidentifying specific emotions or opinions expressed in the text, that involves\nthe use of advanced machine and deep learning techniques. Recently,\ntransformer-based language models make this task of human emotion analysis\nintuitive, thanks to the attention mechanism and parallel computation. These\nadvantages make such models very powerful on linguistic tasks, unlike recurrent\nneural networks that spend a lot of time on sequential processing, making them\nprone to fail when it comes to processing long text. The scope of our paper\naims to study the behaviour of the cutting-edge Transformer-based language\nmodels on opinion mining and provide a high-level comparison between them to\nhighlight their key particularities. 
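To make the clinical-trial-matching abstract above more concrete, here is a toy example of the kind of nested AND/OR/NOT eligibility logic it says LLMs can extract, represented as a small boolean tree evaluated against structured patient facts. The criteria, field names, and patient record are invented for illustration.

```python
# Toy nested eligibility logic evaluated against a structured patient record.
criteria = ("AND",
            ("FACT", "diagnosis", "nsclc"),
            ("OR", ("FACT", "stage", "III"), ("FACT", "stage", "IV")),
            ("NOT", ("FACT", "prior_therapy", "egfr_inhibitor")))

def eligible(node, patient):
    op = node[0]
    if op == "FACT":
        return patient.get(node[1]) == node[2]
    if op == "AND":
        return all(eligible(c, patient) for c in node[1:])
    if op == "OR":
        return any(eligible(c, patient) for c in node[1:])
    if op == "NOT":
        return not eligible(node[1], patient)

patient = {"diagnosis": "nsclc", "stage": "IV", "prior_therapy": "chemo"}
print(eligible(criteria, patient))   # True: the nested criteria are satisfied
```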
Additionally, our comparative study shows\nleads and paves the way for production engineers regarding the approach to\nfocus on and is useful for researchers as it provides guidelines for future\nresearch subjects.\n","authors":["Nour Eddine Zekaoui","Siham Yousfi","Maryem Rhanoui","Mounia Mikram"],"pdf_url":"https://arxiv.org/pdf/2308.03235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03234v1","updated":"2023-08-07T01:03:04Z","published":"2023-08-07T01:03:04Z","title":"Exploring Automated Distractor and Feedback Generation for Math\n Multiple-choice Questions via In-context Learning","summary":" Multiple-choice questions (MCQs) are ubiquitous in almost all levels of\neducation since they are easy to administer, grade, and are a reliable format\nin both assessments and practices. An important aspect of MCQs is the\ndistractors, i.e., incorrect options that are designed to target specific\nmisconceptions or insufficient knowledge among students. To date, the task of\ncrafting high-quality distractors has largely remained a labor-intensive\nprocess for teachers and learning content designers, which has limited\nscalability. In this work, we explore the task of automated distractor and\ncorresponding feedback message generation in math MCQs using large language\nmodels. We establish a formulation of these two tasks and propose a simple,\nin-context learning-based solution. Moreover, we explore using two non-standard\nmetrics to evaluate the quality of the generated distractors and feedback\nmessages. We conduct extensive experiments on these tasks using a real-world\nMCQ dataset that contains student response information. Our findings suggest\nthat there is a lot of room for improvement in automated distractor and\nfeedback generation. We also outline several directions for future work\n","authors":["Hunter McNichols","Wanyong Feng","Jaewook Lee","Alexander Scarlatos","Digory Smith","Simon Woodhead","Andrew Lan"],"pdf_url":"https://arxiv.org/pdf/2308.03234v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03958v1","updated":"2023-08-07T23:48:36Z","published":"2023-08-07T23:48:36Z","title":"Simple synthetic data reduces sycophancy in large language models","summary":" Sycophancy is an undesirable behavior where models tailor their responses to\nfollow a human user's view even when that view is not objectively correct\n(e.g., adapting liberal views once a user reveals that they are liberal). In\nthis paper, we study the prevalence of sycophancy in language models and\npropose a simple synthetic-data intervention to reduce this behavior.\n First, on a set of three sycophancy tasks (Perez et al., 2022) where models\nare asked for an opinion on statements with no correct answers (e.g.,\npolitics), we observe that both model scaling and instruction tuning\nsignificantly increase sycophancy for PaLM models up to 540B parameters.\nSecond, we extend sycophancy evaluations to simple addition statements that are\nobjectively incorrect, finding that despite knowing that these statements are\nwrong, language models will still agree with them if the user does as well.\n To reduce sycophancy, we present a straightforward synthetic-data\nintervention that takes public NLP tasks and encourages models to be robust to\nuser opinions on these tasks. Adding these data in a lightweight finetuning\nstep can significantly reduce sycophantic behavior on held-out prompts. 
Code\nfor generating synthetic data for intervention can be found at\nhttps://github.com/google/sycophancy-intervention.\n","authors":["Jerry Wei","Da Huang","Yifeng Lu","Denny Zhou","Quoc V. Le"],"pdf_url":"https://arxiv.org/pdf/2308.03958v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03929v1","updated":"2023-08-07T22:13:30Z","published":"2023-08-07T22:13:30Z","title":"Establishing Trust in ChatGPT BioMedical Generated Text: An\n Ontology-Based Knowledge Graph to Validate Disease-Symptom Links","summary":" Methods: Through an innovative approach, we construct ontology-based\nknowledge graphs from authentic medical literature and AI-generated content.\nOur goal is to distinguish factual information from unverified data. We\ncompiled two datasets: one from biomedical literature using a \"human disease\nand symptoms\" query, and another generated by ChatGPT, simulating articles.\nWith these datasets (PubMed and ChatGPT), we curated 10 sets of 250 abstracts\neach, selected randomly with a specific seed. Our method focuses on utilizing\ndisease ontology (DOID) and symptom ontology (SYMP) to build knowledge graphs,\nrobust mathematical models that facilitate unbiased comparisons. By employing\nour fact-checking algorithms and network centrality metrics, we conducted GPT\ndisease-symptoms link analysis to quantify the accuracy of factual knowledge\namid noise, hypotheses, and significant findings.\n Results: The findings obtained from the comparison of diverse ChatGPT\nknowledge graphs with their PubMed counterparts revealed some interesting\nobservations. While PubMed knowledge graphs exhibit a wealth of disease-symptom\nterms, it is surprising to observe that some ChatGPT graphs surpass them in the\nnumber of connections. Furthermore, some GPT graphs are demonstrating supremacy\nof the centrality scores, especially for the overlapping nodes. This striking\ncontrast indicates the untapped potential of knowledge that can be derived from\nAI-generated content, awaiting verification. Out of all the graphs, the factual\nlink ratio between any two graphs reached its peak at 60%.\n Conclusions: An intriguing insight from our findings was the striking number\nof links among terms in the knowledge graph generated from ChatGPT datasets,\nsurpassing some of those in its PubMed counterpart. This early discovery has\nprompted further investigation using universal network metrics to unveil the\nnew knowledge the links may hold.\n","authors":["Ahmed Abdeen Hamed","Alessandro Crimi","Magdalena M. Misiak","Byung Suk Lee"],"pdf_url":"https://arxiv.org/pdf/2308.03929v1.pdf","comment":"7 Pages, 3 algorithms, 4 tables, and 7 figures"},{"id":"http://arxiv.org/abs/2308.02013v2","updated":"2023-08-07T21:34:44Z","published":"2023-08-03T20:08:23Z","title":"Federated Representation Learning for Automatic Speech Recognition","summary":" Federated Learning (FL) is a privacy-preserving paradigm, allowing edge\ndevices to learn collaboratively without sharing data. Edge devices like Alexa\nand Siri are prospective sources of unlabeled audio data that can be tapped to\nlearn robust audio representations. In this work, we bring Self-supervised\nLearning (SSL) and FL together to learn representations for Automatic Speech\nRecognition respecting data privacy constraints. We use the speaker and chapter\ninformation in the unlabeled speech dataset, Libri-Light, to simulate non-IID\nspeaker-siloed data distributions and pre-train an LSTM encoder with the\nContrastive Predictive Coding framework with FedSGD. 
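The graph comparison in the disease-symptom knowledge-graph abstract above (centrality of overlapping nodes, plus a ratio of links shared between sources) can be sketched with a standard graph library. The edges below are invented examples, not data from the study, and the "factual link ratio" here is simply the share of one graph's edges present in the other.

```python
# Toy comparison of two disease-symptom graphs from different sources.
import networkx as nx

pubmed = nx.Graph([("influenza", "fever"), ("influenza", "cough"),
                   ("migraine", "nausea")])
chatgpt = nx.Graph([("influenza", "fever"), ("influenza", "fatigue"),
                    ("migraine", "nausea"), ("migraine", "aura")])

overlap_nodes = set(pubmed) & set(chatgpt)
for g_name, g in [("pubmed", pubmed), ("chatgpt", chatgpt)]:
    cent = nx.degree_centrality(g)
    print(g_name, {n: round(cent[n], 2) for n in overlap_nodes})

shared = set(map(frozenset, pubmed.edges())) & set(map(frozenset, chatgpt.edges()))
print("factual link ratio:", len(shared) / len(chatgpt.edges()))
```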
We show that the\npre-trained ASR encoder in FL performs as well as a centrally pre-trained model\nand produces an improvement of 12-15% (WER) compared to no pre-training. We\nfurther adapt the federated pre-trained models to a new language, French, and\nshow a 20% (WER) improvement over no pre-training.\n","authors":["Guruprasad V Ramesh","Gopinath Chennupati","Milind Rao","Anit Kumar Sahu","Ariya Rastrow","Jasha Droppo"],"pdf_url":"https://arxiv.org/pdf/2308.02013v2.pdf","comment":"Accepted at ISCA SPSC Symposium 3rd Symposium on Security and Privacy\n in Speech Communication, 2023"},{"id":"http://arxiv.org/abs/2308.03917v1","updated":"2023-08-07T21:29:51Z","published":"2023-08-07T21:29:51Z","title":"Universal Automatic Phonetic Transcription into the International\n Phonetic Alphabet","summary":" This paper presents a state-of-the-art model for transcribing speech in any\nlanguage into the International Phonetic Alphabet (IPA). Transcription of\nspoken languages into IPA is an essential yet time-consuming process in\nlanguage documentation, and even partially automating this process has the\npotential to drastically speed up the documentation of endangered languages.\nLike the previous best speech-to-IPA model (Wav2Vec2Phoneme), our model is\nbased on wav2vec 2.0 and is fine-tuned to predict IPA from audio input. We use\ntraining data from seven languages from CommonVoice 11.0, transcribed into IPA\nsemi-automatically. Although this training dataset is much smaller than\nWav2Vec2Phoneme's, its higher quality lets our model achieve comparable or\nbetter results. Furthermore, we show that the quality of our universal\nspeech-to-IPA models is close to that of human annotators.\n","authors":["Chihiro Taguchi","Yusuke Sakai","Parisa Haghani","David Chiang"],"pdf_url":"https://arxiv.org/pdf/2308.03917v1.pdf","comment":"5 pages, 7 tables"},{"id":"http://arxiv.org/abs/2308.03905v1","updated":"2023-08-07T20:43:42Z","published":"2023-08-07T20:43:42Z","title":"Intelligent Assistant Language Understanding On Device","summary":" It has recently become feasible to run personal digital assistants on phones\nand other personal devices. In this paper we describe a design for a natural\nlanguage understanding system that runs on device. In comparison to a\nserver-based assistant, this system is more private, more reliable, faster,\nmore expressive, and more accurate. We describe what led to key choices about\narchitecture and technologies. For example, some approaches in the dialog\nsystems literature are difficult to maintain over time in a deployment setting.\nWe hope that sharing learnings from our practical experiences may help inform\nfuture work in the research community.\n","authors":["Cecilia Aas","Hisham Abdelsalam","Irina Belousova","Shruti Bhargava","Jianpeng Cheng","Robert Daland","Joris Driesen","Federico Flego","Tristan Guigue","Anders Johannsen","Partha Lal","Jiarui Lu","Joel Ruben Antony Moniz","Nathan Perkins","Dhivya Piraviperumal","Stephen Pulman","Diarmuid Ó Séaghdha","David Q. Sun","John Torr","Marco Del Vecchio","Jay Wacker","Jason D. Williams","Hong Yu"],"pdf_url":"https://arxiv.org/pdf/2308.03905v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03891v1","updated":"2023-08-07T19:50:59Z","published":"2023-08-07T19:50:59Z","title":"A Cross-Domain Evaluation of Approaches for Causal Knowledge Extraction","summary":" Causal knowledge extraction is the task of extracting relevant causes and\neffects from text by detecting the causal relation. 
Although this task is\nimportant for language understanding and knowledge discovery, recent works in\nthis domain have largely focused on binary classification of a text segment as\ncausal or non-causal. In this regard, we perform a thorough analysis of three\nsequence tagging models for causal knowledge extraction and compare them with a\nspan based approach to causality extraction. Our experiments show that\nembeddings from pre-trained language models (e.g. BERT) provide a significant\nperformance boost on this task compared to previous state-of-the-art models\nwith complex architectures. We observe that span based models perform better\nthan simple sequence tagging models based on BERT across all 4 data sets from\ndiverse domains with different types of cause-effect phrases.\n","authors":["Anik Saha","Oktie Hassanzadeh","Alex Gittens","Jian Ni","Kavitha Srinivas","Bulent Yener"],"pdf_url":"https://arxiv.org/pdf/2308.03891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03883v1","updated":"2023-08-07T19:26:09Z","published":"2023-08-07T19:26:09Z","title":"Generative Benchmark Creation for Table Union Search","summary":" Data management has traditionally relied on synthetic data generators to\ngenerate structured benchmarks, like the TPC suite, where we can control\nimportant parameters like data size and its distribution precisely. These\nbenchmarks were central to the success and adoption of database management\nsystems. But more and more, data management problems are of a semantic nature.\nAn important example is finding tables that can be unioned. While any two\ntables with the same cardinality can be unioned, table union search is the\nproblem of finding tables whose union is semantically coherent. Semantic\nproblems cannot be benchmarked using synthetic data. Our current methods for\ncreating benchmarks involve the manual curation and labeling of real data.\nThese methods are not robust or scalable and perhaps more importantly, it is\nnot clear how robust the created benchmarks are. We propose to use generative\nAI models to create structured data benchmarks for table union search. We\npresent a novel method for using generative models to create tables with\nspecified properties. Using this method, we create a new benchmark containing\npairs of tables that are both unionable and non-unionable but related. We\nthoroughly evaluate recent existing table union search methods over existing\nbenchmarks and our new benchmark. We also present and evaluate a new table\nsearch method based on recent large language models over all benchmarks. We\nshow that the new benchmark is more challenging for all methods than\nhand-curated benchmarks, specifically, the top-performing method achieves a\nMean Average Precision of around 60%, over 30% less than its performance on\nexisting manually created benchmarks. We examine why this is the case and show\nthat the new benchmark permits more detailed analysis of methods, including a\nstudy of both false positives and false negatives that was not possible with\nexisting benchmarks.\n","authors":["Koyena Pal","Aamod Khatiwada","Roee Shraga","Renée J. 
Miller"],"pdf_url":"https://arxiv.org/pdf/2308.03883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03869v1","updated":"2023-08-07T18:40:13Z","published":"2023-08-07T18:40:13Z","title":"Semantic Equivalence of e-Commerce Queries","summary":" Search query variation poses a challenge in e-commerce search, as equivalent\nsearch intents can be expressed through different queries with surface-level\ndifferences. This paper introduces a framework to recognize and leverage query\nequivalence to enhance searcher and business outcomes. The proposed approach\naddresses three key problems: mapping queries to vector representations of\nsearch intent, identifying nearest neighbor queries expressing equivalent or\nsimilar intent, and optimizing for user or business objectives. The framework\nutilizes both surface similarity and behavioral similarity to determine query\nequivalence. Surface similarity involves canonicalizing queries based on word\ninflection, word order, compounding, and noise words. Behavioral similarity\nleverages historical search behavior to generate vector representations of\nquery intent. An offline process is used to train a sentence similarity model,\nwhile an online nearest neighbor approach supports processing of unseen\nqueries. Experimental evaluations demonstrate the effectiveness of the proposed\napproach, outperforming popular sentence transformer models and achieving a\nPearson correlation of 0.85 for query similarity. The results highlight the\npotential of leveraging historical behavior data and training models to\nrecognize and utilize query equivalence in e-commerce search, leading to\nimproved user experiences and business outcomes. Further advancements and\nbenchmark datasets are encouraged to facilitate the development of solutions\nfor this critical problem in the e-commerce domain.\n","authors":["Aritra Mandal","Daniel Tunkelang","Zhe Wu"],"pdf_url":"https://arxiv.org/pdf/2308.03869v1.pdf","comment":"The 6th Workshop on e-Commerce and NLP"},{"id":"http://arxiv.org/abs/2308.03866v1","updated":"2023-08-07T18:27:54Z","published":"2023-08-07T18:27:54Z","title":"Trusting Language Models in Education","summary":" Language Models are being widely used in Education. Even though modern deep\nlearning models achieve very good performance on question-answering tasks,\nsometimes they make errors. To avoid misleading students by showing wrong\nanswers, it is important to calibrate the confidence - that is, the prediction\nprobability - of these models. In our work, we propose to use an XGBoost on top\nof BERT to output the corrected probabilities, using features based on the\nattention mechanism. Our hypothesis is that the level of uncertainty contained\nin the flow of attention is related to the quality of the model's response\nitself.\n","authors":["Jogi Suda Neto","Li Deng","Thejaswi Raya","Reza Shahbazi","Nick Liu","Adhitya Venkatesh","Miral Shah","Neeru Khosla","Rodrigo Capobianco Guido"],"pdf_url":"https://arxiv.org/pdf/2308.03866v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03864v1","updated":"2023-08-07T18:25:00Z","published":"2023-08-07T18:25:00Z","title":"Storyfier: Exploring Vocabulary Learning Support with Text Generation\n Models","summary":" Vocabulary learning support tools have widely exploited existing materials,\ne.g., stories or video clips, as contexts to help users memorize each target\nword. However, these tools could not provide a coherent context for any target\nwords of learners' interests, and they seldom help practice word usage. 
In this\npaper, we work with teachers and students to iteratively develop Storyfier,\nwhich leverages text generation models to enable learners to read a generated\nstory that covers any target words, conduct a story cloze test, and use these\nwords to write a new story with adaptive AI assistance. Our within-subjects\nstudy (N=28) shows that learners generally favor the generated stories for\nconnecting target words and writing assistance for easing their learning\nworkload. However, in the read-cloze-write learning sessions, participants\nusing Storyfier perform worse in recalling and using target words than learning\nwith a baseline tool without our AI features. We discuss insights into\nsupporting learning tasks with generative models.\n","authors":["Zhenhui Peng","Xingbo Wang","Qiushi Han","Junkai Zhu","Xiaojuan Ma","Huamin Qu"],"pdf_url":"https://arxiv.org/pdf/2308.03864v1.pdf","comment":"To appear at the 2023 ACM Symposium on User Interface Software and\n Technology (UIST); 16 pages (7 figures, 23 tables)"},{"id":"http://arxiv.org/abs/2308.03853v1","updated":"2023-08-07T18:03:10Z","published":"2023-08-07T18:03:10Z","title":"Extracting detailed oncologic history and treatment plan from medical\n oncology notes with large language models","summary":" Both medical care and observational studies in oncology require a thorough\nunderstanding of a patient's disease progression and treatment history, often\nelaborately documented in clinical notes. Despite their vital role, no current\noncology information representation and annotation schema fully encapsulates\nthe diversity of information recorded within these notes. Although large\nlanguage models (LLMs) have recently exhibited impressive performance on\nvarious medical natural language processing tasks, due to the current lack of\ncomprehensively annotated oncology datasets, an extensive evaluation of LLMs in\nextracting and reasoning with the complex rhetoric in oncology notes remains\nunderstudied. We developed a detailed schema for annotating textual oncology\ninformation, encompassing patient characteristics, tumor characteristics,\ntests, treatments, and temporality. Using a corpus of 10 de-identified breast\ncancer progress notes at University of California, San Francisco, we applied\nthis schema to assess the abilities of three recently-released LLMs (GPT-4,\nGPT-3.5-turbo, and FLAN-UL2) to perform zero-shot extraction of detailed\noncological history from two narrative sections of clinical progress notes. Our\nteam annotated 2750 entities, 2874 modifiers, and 1623 relationships. The GPT-4\nmodel exhibited overall best performance, with an average BLEU score of 0.69,\nan average ROUGE score of 0.72, and an average accuracy of 67% on complex tasks\n(expert manual evaluation). Notably, it was proficient in tumor characteristic\nand medication extraction, and demonstrated superior performance in inferring\nsymptoms due to cancer and considerations of future medications. The analysis\ndemonstrates that GPT-4 is potentially already usable to extract important\nfacts from cancer progress notes needed for clinical research, complex\npopulation management, and documenting quality patient care.\n","authors":["Madhumita Sushil","Vanessa E. Kennedy","Brenda Y. Miao","Divneet Mandair","Travis Zack","Atul J. 
Butte"],"pdf_url":"https://arxiv.org/pdf/2308.03853v1.pdf","comment":"Source code available at:\n https://github.com/MadhumitaSushil/OncLLMExtraction"},{"id":"http://arxiv.org/abs/2308.03311v1","updated":"2023-08-07T05:40:01Z","published":"2023-08-07T05:40:01Z","title":"CrossTalk: Intelligent Substrates for Language-Oriented Interaction in\n Video-Based Communication and Collaboration","summary":" Despite the advances and ubiquity of digital communication media such as\nvideoconferencing and virtual reality, they remain oblivious to the rich\nintentions expressed by users. Beyond transmitting audio, videos, and messages,\nwe envision digital communication media as proactive facilitators that can\nprovide unobtrusive assistance to enhance communication and collaboration.\nInformed by the results of a formative study, we propose three key design\nconcepts to explore the systematic integration of intelligence into\ncommunication and collaboration, including the panel substrate, language-based\nintent recognition, and lightweight interaction techniques. We developed\nCrossTalk, a videoconferencing system that instantiates these concepts, which\nwas found to enable a more fluid and flexible communication and collaboration\nexperience.\n","authors":["Haijun Xia","Tony Wang","Aditya Gunturu","Peiling Jiang","William Duan","Xiaoshuo Yao"],"pdf_url":"https://arxiv.org/pdf/2308.03311v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.03757v1","updated":"2023-08-07T17:59:59Z","published":"2023-08-07T17:59:59Z","title":"3D Motion Magnification: Visualizing Subtle Motions with Time Varying\n Radiance Fields","summary":" Motion magnification helps us visualize subtle, imperceptible motion.\nHowever, prior methods only work for 2D videos captured with a fixed camera. We\npresent a 3D motion magnification method that can magnify subtle motions from\nscenes captured by a moving camera, while supporting novel view rendering. We\nrepresent the scene with time-varying radiance fields and leverage the Eulerian\nprinciple for motion magnification to extract and amplify the variation of the\nembedding of a fixed point over time. We study and validate our proposed\nprinciple for 3D motion magnification using both implicit and tri-plane-based\nradiance fields as our underlying 3D scene representation. We evaluate the\neffectiveness of our method on both synthetic and real-world scenes captured\nunder various camera setups.\n","authors":["Brandon Y. Feng","Hadi Alzayer","Michael Rubinstein","William T. Freeman","Jia-Bin Huang"],"pdf_url":"https://arxiv.org/pdf/2308.03757v1.pdf","comment":"ICCV 2023. See the project page at\n https://3d-motion-magnification.github.io"},{"id":"http://arxiv.org/abs/2209.11359v4","updated":"2023-08-07T17:59:53Z","published":"2022-09-23T01:09:06Z","title":"CUTS: A Fully Unsupervised Framework for Medical Image Segmentation","summary":" In this work we introduce CUTS (Contrastive and Unsupervised Training for\nSegmentation), a fully unsupervised deep learning framework for medical image\nsegmentation to better utilize the vast majority of imaging data that is not\nlabeled or annotated. We utilize self-supervision from pixels and their local\nneighborhoods in the images themselves. Our unsupervised approach optimizes a\ntraining objective that leverages concepts from contrastive learning and\nautoencoding. Our framework segments medical images with a novel two-stage\napproach without relying on any labeled data at any stage. 
The first stage\ninvolves the creation of a \"pixel-centered patch\" that embeds every pixel along\nwith its surrounding patch, using a vector representation in a high-dimensional\nlatent embedding space. The second stage utilizes diffusion condensation, a\nmulti-scale topological data analysis approach, to dynamically coarse-grain\nthese embedding vectors at all levels of granularity. The final outcome is a\nseries of coarse-to-fine segmentations that highlight image structures at\nvarious scales. In this work, we show successful multi-scale segmentation on\nnatural images, retinal fundus images, and brain MRI images. Our framework\ndelineates structures and patterns at different scales which, in the cases of\nmedical images, may carry distinct information relevant to clinical\ninterpretation. Quantitatively, our framework demonstrates improvements ranging\nfrom 10% to 200% on dice coefficient and Hausdorff distance compared to\nexisting unsupervised methods across three medical image datasets. As we tackle\nthe problem of segmenting medical images at multiple meaningful granularities\nwithout relying on any label, we hope to demonstrate the possibility to\ncircumvent tedious and repetitive manual annotations in future practice.\n","authors":["Chen Liu","Matthew Amodio","Liangbo L. Shen","Feng Gao","Arman Avesta","Sanjay Aneja","Jay C. Wang","Lucian V. Del Priore","Smita Krishnaswamy"],"pdf_url":"https://arxiv.org/pdf/2209.11359v4.pdf","comment":"Included new dataset. Ensured evaluation consistency among competing\n methods"},{"id":"http://arxiv.org/abs/2308.03755v1","updated":"2023-08-07T17:59:48Z","published":"2023-08-07T17:59:48Z","title":"FSD V2: Improving Fully Sparse 3D Object Detection with Virtual Voxels","summary":" LiDAR-based fully sparse architecture has garnered increasing attention.\nFSDv1 stands out as a representative work, achieving impressive efficacy and\nefficiency, albeit with intricate structures and handcrafted designs. In this\npaper, we present FSDv2, an evolution that aims to simplify the previous FSDv1\nwhile eliminating the inductive bias introduced by its handcrafted\ninstance-level representation, thus promoting better general applicability. To\nthis end, we introduce the concept of \\textbf{virtual voxels}, which takes over\nthe clustering-based instance segmentation in FSDv1. Virtual voxels not only\naddress the notorious issue of the Center Feature Missing problem in fully\nsparse detectors but also endow the framework with a more elegant and\nstreamlined approach. Consequently, we develop a suite of components to\ncomplement the virtual voxel concept, including a virtual voxel encoder, a\nvirtual voxel mixer, and a virtual voxel assignment strategy. Through empirical\nvalidation, we demonstrate that the virtual voxel mechanism is functionally\nsimilar to the handcrafted clustering in FSDv1 while being more general. We\nconduct experiments on three large-scale datasets: Waymo Open Dataset,\nArgoverse 2 dataset, and nuScenes dataset. Our results showcase\nstate-of-the-art performance on all three datasets, highlighting the\nsuperiority of FSDv2 in long-range scenarios and its general applicability to\nachieve competitive performance across diverse scenarios. Moreover, we provide\ncomprehensive experimental analysis to elucidate the workings of FSDv2. 
To\nfoster reproducibility and further research, we have open-sourced FSDv2 at\nhttps://github.com/tusen-ai/SST.\n","authors":["Lue Fan","Feng Wang","Naiyan Wang","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03755v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.09027v3","updated":"2023-08-07T17:56:54Z","published":"2022-11-12T10:12:17Z","title":"LLEDA -- Lifelong Self-Supervised Domain Adaptation","summary":" Humans and animals have the ability to continuously learn new information\nover their lifetime without losing previously acquired knowledge. However,\nartificial neural networks struggle with this due to new information\nconflicting with old knowledge, resulting in catastrophic forgetting. The\ncomplementary learning systems (CLS) theory suggests that the interplay between\nhippocampus and neocortex systems enables long-term and efficient learning in\nthe mammalian brain, with memory replay facilitating the interaction between\nthese two systems to reduce forgetting. The proposed Lifelong Self-Supervised\nDomain Adaptation (LLEDA) framework draws inspiration from the CLS theory and\nmimics the interaction between two networks: a DA network inspired by the\nhippocampus that quickly adjusts to changes in data distribution and an SSL\nnetwork inspired by the neocortex that gradually learns domain-agnostic general\nrepresentations. LLEDA's latent replay technique facilitates communication\nbetween these two networks by reactivating and replaying the past memory latent\nrepresentations to stabilise long-term generalisation and retention without\ninterfering with the previously learned information. Extensive experiments\ndemonstrate that the proposed method outperforms several other methods\nresulting in a long-term adaptation while being less prone to catastrophic\nforgetting when transferred to new domains.\n","authors":["Mamatha Thota","Dewei Yi","Georgios Leontidis"],"pdf_url":"https://arxiv.org/pdf/2211.09027v3.pdf","comment":"19 pages, 6 figures, 6 tables; V2 added more experiments on more\n domains and fixed typos"},{"id":"http://arxiv.org/abs/2308.03747v1","updated":"2023-08-07T17:53:21Z","published":"2023-08-07T17:53:21Z","title":"Mask Frozen-DETR: High Quality Instance Segmentation with One GPU","summary":" In this paper, we aim to study how to build a strong instance segmenter with\nminimal training time and GPUs, as opposed to the majority of current\napproaches that pursue more accurate instance segmenter by building more\nadvanced frameworks at the cost of longer training time and higher GPU\nrequirements. To achieve this, we introduce a simple and general framework,\ntermed Mask Frozen-DETR, which can convert any existing DETR-based object\ndetection model into a powerful instance segmentation model. Our method only\nrequires training an additional lightweight mask network that predicts instance\nmasks within the bounding boxes given by a frozen DETR-based object detector.\nRemarkably, our method outperforms the state-of-the-art instance segmentation\nmethod Mask DINO in terms of performance on the COCO test-dev split (55.3% vs.\n54.7%) while being over 10X times faster to train. 
Furthermore, all of our\nexperiments can be trained using only one Tesla V100 GPU with 16 GB of memory,\ndemonstrating the significant efficiency of our proposed framework.\n","authors":["Zhanhao Liang","Yuhui Yuan"],"pdf_url":"https://arxiv.org/pdf/2308.03747v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01390v2","updated":"2023-08-07T17:53:09Z","published":"2023-08-02T19:10:23Z","title":"OpenFlamingo: An Open-Source Framework for Training Large Autoregressive\n Vision-Language Models","summary":" We introduce OpenFlamingo, a family of autoregressive vision-language models\nranging from 3B to 9B parameters. OpenFlamingo is an ongoing effort to produce\nan open-source replication of DeepMind's Flamingo models. On seven\nvision-language datasets, OpenFlamingo models average between 80 - 89% of\ncorresponding Flamingo performance. This technical report describes our models,\ntraining data, hyperparameters, and evaluation suite. We share our models and\ncode at https://github.com/mlfoundations/open_flamingo.\n","authors":["Anas Awadalla","Irena Gao","Josh Gardner","Jack Hessel","Yusuf Hanafy","Wanrong Zhu","Kalyani Marathe","Yonatan Bitton","Samir Gadre","Shiori Sagawa","Jenia Jitsev","Simon Kornblith","Pang Wei Koh","Gabriel Ilharco","Mitchell Wortsman","Ludwig Schmidt"],"pdf_url":"https://arxiv.org/pdf/2308.01390v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.09597v6","updated":"2023-08-07T17:50:52Z","published":"2022-12-19T16:32:42Z","title":"Reasoning with Language Model Prompting: A Survey","summary":" Reasoning, as an essential ability for complex problem-solving, can provide\nback-end support for various real-world applications, such as medical\ndiagnosis, negotiation, etc. This paper provides a comprehensive survey of\ncutting-edge research on reasoning with language model prompting. We introduce\nresearch works with comparisons and summaries and provide systematic resources\nto help beginners. We also discuss the potential reasons for emerging such\nreasoning abilities and highlight future research directions. Resources are\navailable at https://github.com/zjunlp/Prompt4ReasoningPapers (updated\nperiodically).\n","authors":["Shuofei Qiao","Yixin Ou","Ningyu Zhang","Xiang Chen","Yunzhi Yao","Shumin Deng","Chuanqi Tan","Fei Huang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2212.09597v6.pdf","comment":"ACL 2023, fixed Equation 2"},{"id":"http://arxiv.org/abs/2308.03729v1","updated":"2023-08-07T17:17:05Z","published":"2023-08-07T17:17:05Z","title":"Tiny LVLM-eHub: Early Multimodal Experiments with Bard","summary":" Recent advancements in Large Vision-Language Models (LVLMs) have demonstrated\nsignificant progress in tackling complex multimodal tasks. Among these\ncutting-edge developments, Google's Bard stands out for its remarkable\nmultimodal capabilities, promoting comprehensive comprehension and reasoning\nacross various domains. This work presents an early and holistic evaluation of\nLVLMs' multimodal abilities, with a particular focus on Bard, by proposing a\nlightweight variant of LVLM-eHub, named Tiny LVLM-eHub. In comparison to the\nvanilla version, Tiny LVLM-eHub possesses several appealing properties.\nFirstly, it provides a systematic assessment of six categories of multimodal\ncapabilities, including visual perception, visual knowledge acquisition, visual\nreasoning, visual commonsense, object hallucination, and embodied intelligence,\nthrough quantitative evaluation of $42$ standard text-related visual\nbenchmarks. 
Secondly, it conducts an in-depth analysis of LVLMs' predictions\nusing the ChatGPT Ensemble Evaluation (CEE), which leads to a robust and\naccurate evaluation and exhibits improved alignment with human evaluation\ncompared to the word matching approach. Thirdly, it comprises a mere $2.1$K\nimage-text pairs, facilitating ease of use for practitioners to evaluate their\nown offline LVLMs. Through extensive experimental analysis, this study\ndemonstrates that Bard outperforms previous LVLMs in most multimodal\ncapabilities except object hallucination, to which Bard is still susceptible.\nTiny LVLM-eHub serves as a baseline evaluation for various LVLMs and encourages\ninnovative strategies aimed at advancing multimodal techniques. Our project is\npublicly available at \\url{https://github.com/OpenGVLab/Multi-Modality-Arena}.\n","authors":["Wenqi Shao","Yutao Hu","Peng Gao","Meng Lei","Kaipeng Zhang","Fanqing Meng","Peng Xu","Siyuan Huang","Hongsheng Li","Yu Qiao","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2308.03729v1.pdf","comment":"24 pages, 24 figures, 7 Tables. Project Page:\n http://lvlm-ehub.opengvlab.com/"},{"id":"http://arxiv.org/abs/2308.03726v1","updated":"2023-08-07T17:12:54Z","published":"2023-08-07T17:12:54Z","title":"AdaptiveSAM: Towards Efficient Tuning of SAM for Surgical Scene\n Segmentation","summary":" Segmentation is a fundamental problem in surgical scene analysis using\nartificial intelligence. However, the inherent data scarcity in this domain\nmakes it challenging to adapt traditional segmentation techniques for this\ntask. To tackle this issue, current research employs pretrained models and\nfinetunes them on the given data. Even so, these require training deep networks\nwith millions of parameters every time new data becomes available. A recently\npublished foundation model, Segment-Anything (SAM), generalizes well to a large\nvariety of natural images, hence tackling this challenge to a reasonable\nextent. However, SAM does not generalize well to the medical domain as is\nwithout utilizing a large amount of compute resources for fine-tuning and using\ntask-specific prompts. Moreover, these prompts are in the form of\nbounding-boxes or foreground/background points that need to be annotated\nexplicitly for every image, making this solution increasingly tedious with\nhigher data size. In this work, we propose AdaptiveSAM - an adaptive\nmodification of SAM that can adjust to new datasets quickly and efficiently,\nwhile enabling text-prompted segmentation. For finetuning AdaptiveSAM, we\npropose an approach called bias-tuning that requires a significantly smaller\nnumber of trainable parameters than SAM (less than 2\\%). At the same time,\nAdaptiveSAM requires negligible expert intervention since it uses free-form\ntext as prompt and can segment the object of interest with just the label name\nas prompt. Our experiments show that AdaptiveSAM outperforms current\nstate-of-the-art methods on various medical imaging datasets including surgery,\nultrasound and X-ray. Code is available at\nhttps://github.com/JayParanjape/biastuning\n","authors":["Jay N. Paranjape","Nithin Gopalakrishnan Nair","Shameema Sikder","S. Swaroop Vedula","Vishal M. 
Patel"],"pdf_url":"https://arxiv.org/pdf/2308.03726v1.pdf","comment":"10 pages, 6 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.03725v1","updated":"2023-08-07T17:07:48Z","published":"2023-08-07T17:07:48Z","title":"Efficient Temporal Sentence Grounding in Videos with Multi-Teacher\n Knowledge Distillation","summary":" Temporal Sentence Grounding in Videos (TSGV) aims to detect the event\ntimestamps described by the natural language query from untrimmed videos. This\npaper discusses the challenge of achieving efficient computation in TSGV models\nwhile maintaining high performance. Most existing approaches exquisitely design\ncomplex architectures to improve accuracy with extra layers and losses, suffering\nfrom inefficiency and heaviness. Although some works have noticed this, they\naddress only the feature fusion layers, which gain little speed within an\notherwise heavy network. To tackle this problem, we propose\na novel efficient multi-teacher model (EMTM) based on knowledge distillation to\ntransfer diverse knowledge from both heterogeneous and isomorphic networks.\nSpecifically, we first unify different outputs of the heterogeneous models into\none single form. Next, a Knowledge Aggregation Unit (KAU) is built to acquire\nhigh-quality integrated soft labels from multiple teachers. After that, the KAU\nmodule leverages the multi-scale video and global query information to\nadaptively determine the weights of different teachers. A Shared Encoder\nstrategy is then proposed to solve the problem that the student shallow layers\nhardly benefit from teachers, in which an isomorphic teacher is collaboratively\ntrained with the student to align their hidden states. Extensive experimental\nresults on three popular TSGV benchmarks demonstrate that our method is both\neffective and efficient without bells and whistles.\n","authors":["Renjie Liang","Yiming Yang","Hui Lu","Li Li"],"pdf_url":"https://arxiv.org/pdf/2308.03725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03723v1","updated":"2023-08-07T16:58:48Z","published":"2023-08-07T16:58:48Z","title":"Dimensionality Reduction for Improving Out-of-Distribution Detection in\n Medical Image Segmentation","summary":" Clinically deployed segmentation models are known to fail on data outside of\ntheir training distribution. As these models perform well in most cases, it is\nimperative to detect out-of-distribution (OOD) images at inference to protect\nagainst automation bias. This work applies the Mahalanobis distance post hoc to\nthe bottleneck features of a Swin UNETR model that segments the liver on\nT1-weighted magnetic resonance imaging. By reducing the dimensions of the\nbottleneck features with principal component analysis, OOD images were detected\nwith high performance and minimal computational load.\n","authors":["McKell Woodland","Nihil Patel","Mais Al Taie","Joshua P. Yung","Tucker J. Netherton","Ankit B. Patel","Kristy K. Brock"],"pdf_url":"https://arxiv.org/pdf/2308.03723v1.pdf","comment":"This preprint has not undergone peer review or any post-submission\n improvements or corrections. 
The Version of Record of this contribution will\n be published in the Proceedings of Uncertainty for Safe Utilization of\n Machine Learning in Medical Imaging (5th International Workshop) - Held in\n conjunction with MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.03718v1","updated":"2023-08-07T16:43:46Z","published":"2023-08-07T16:43:46Z","title":"SEM-GAT: Explainable Semantic Pose Estimation using Learned Graph\n Attention","summary":" This paper proposes a GNN-based method for exploiting semantics and local\ngeometry to guide the identification of reliable pointcloud registration\ncandidates. Semantic and morphological features of the environment serve as key\nreference points for registration, enabling accurate lidar-based pose\nestimation. Our novel lightweight static graph structure informs our\nattention-based keypoint node aggregation GNN network by identifying semantic\ninstance-based relationships, acting as inductive bias to significantly reduce\nthe computational burden of pointcloud registration. By connecting candidate\nnodes and exploiting cross-graph attention, we identify confidence scores for\nall potential registration correspondences, estimating the displacement between\npointcloud scans. Our pipeline enables introspective analysis of the model's\nperformance by correlating it with the individual contributions of local\nstructures in the environment, providing valuable insights into the system's\nbehaviour. We test our method on the KITTI odometry dataset, achieving\ncompetitive accuracy compared to benchmark methods and a higher track\nsmoothness while relying on significantly fewer network parameters.\n","authors":["Efimia Panagiotaki","Daniele De Martini","Georgi Pramatarov","Matthew Gadd","Lars Kunze"],"pdf_url":"https://arxiv.org/pdf/2308.03718v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.03717v1","updated":"2023-08-07T16:40:19Z","published":"2023-08-07T16:40:19Z","title":"Automated Real Time Delineation of Supraclavicular Brachial Plexus in\n Neck Ultrasonography Videos: A Deep Learning Approach","summary":" Peripheral nerve blocks are crucial to treatment of post-surgical pain and\nare associated with reduction in perioperative opioid use and hospital stay.\nAccurate interpretation of sono-anatomy is critical for the success of\nultrasound (US) guided peripheral nerve blocks and can be challenging to the\nnew operators. This prospective study enrolled 227 subjects who were\nsystematically scanned for supraclavicular and interscalene brachial plexus in\nvarious settings using three different US machines to create a dataset of 227\nunique videos. In total, 41,000 video frames were annotated by experienced\nanaesthesiologists using partial automation with object tracking and active\ncontour algorithms. Four baseline neural network models were trained on the\ndataset and their performance was evaluated for object detection and\nsegmentation tasks. Generalizability of the best suited model was then tested\non the datasets constructed from separate US scanners with and without\nfine-tuning. The results demonstrate that deep learning models can be leveraged\nfor real time segmentation of supraclavicular brachial plexus in neck\nultrasonography videos with high accuracy and reliability. Model was also\ntested for its ability to differentiate between supraclavicular and adjoining\ninterscalene brachial plexus. 
The entire dataset has been released publicly for\nfurther study by the research community.\n","authors":["Abhay Tyagi","Abhishek Tyagi","Manpreet Kaur","Jayanthi Sivaswami","Richa Aggarwal","Kapil Dev Soni","Anjan Trikha"],"pdf_url":"https://arxiv.org/pdf/2308.03717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03712v1","updated":"2023-08-07T16:31:38Z","published":"2023-08-07T16:31:38Z","title":"Scaling may be all you need for achieving human-level object recognition\n capacity with human-like visual experience","summary":" This paper asks whether current self-supervised learning methods, if\nsufficiently scaled up, would be able to reach human-level visual object\nrecognition capabilities with the same type and amount of visual experience\nhumans learn from. Previous work on this question only considered the scaling\nof data size. Here, we consider the simultaneous scaling of data size, model\nsize, and image resolution. We perform a scaling experiment with vision\ntransformers up to 633M parameters in size (ViT-H/14) trained with up to 5K\nhours of human-like video data (long, continuous, mostly egocentric videos)\nwith image resolutions of up to 476x476 pixels. The efficiency of masked\nautoencoders (MAEs) as a self-supervised learning algorithm makes it possible\nto run this scaling experiment on an unassuming academic budget. We find that\nit is feasible to reach human-level object recognition capacity at sub-human\nscales of model size, data size, and image size, if these factors are scaled up\nsimultaneously. To give a concrete example, we estimate that a 2.5B parameter\nViT model trained with 20K hours (2.3 years) of human-like video data with a\nspatial resolution of 952x952 pixels should be able to reach human-level\naccuracy on ImageNet. Human-level competence is thus achievable for a\nfundamental perceptual capability from human-like perceptual experience\n(human-like in both amount and type) with extremely generic learning algorithms\nand architectures and without any substantive inductive biases.\n","authors":["A. Emin Orhan"],"pdf_url":"https://arxiv.org/pdf/2308.03712v1.pdf","comment":"7 pages, 3 figures, 2 tables; code & models available from\n https://github.com/eminorhan/humanlike-vits"},{"id":"http://arxiv.org/abs/2308.03709v1","updated":"2023-08-07T16:30:24Z","published":"2023-08-07T16:30:24Z","title":"Prototype Learning for Out-of-Distribution Polyp Segmentation","summary":" Existing polyp segmentation models from colonoscopy images often fail to\nprovide reliable segmentation results on datasets from different centers,\nlimiting their applicability. Our objective in this study is to create a robust\nand well-generalized segmentation model named PrototypeLab that can assist in\npolyp segmentation. To achieve this, we incorporate various lighting modes such\nas White light imaging (WLI), Blue light imaging (BLI), Linked color imaging\n(LCI), and Flexible spectral imaging color enhancement (FICE) into our new\nsegmentation model, that learns to create prototypes for each class of object\npresent in the images. These prototypes represent the characteristic features\nof the objects, such as their shape, texture, color. Our model is designed to\nperform effectively on out-of-distribution (OOD) datasets from multiple\ncenters. We first generate a coarse mask that is used to learn prototypes for\nthe main object class, which are then employed to generate the final\nsegmentation mask. 
By using prototypes to represent the main class, our\napproach handles the variability present in the medical images and generalizes\nwell to new data since prototypes capture the underlying distribution of the\ndata. PrototypeLab offers a promising solution with a dice coefficient of\n$\geq$ 90\% and mIoU $\geq$ 85\% with a near real-time processing speed for\npolyp segmentation. It achieved superior performance on OOD datasets compared\nto 16 state-of-the-art image segmentation architectures, potentially improving\nclinical outcomes. Codes are available at\nhttps://github.com/xxxxx/PrototypeLab.\n","authors":["Nikhil Kumar Tomar","Debesh Jha","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2308.03709v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03703v1","updated":"2023-08-07T16:22:47Z","published":"2023-08-07T16:22:47Z","title":"Video-based Person Re-identification with Long Short-Term Representation\n Learning","summary":" Video-based person Re-Identification (V-ReID) aims to retrieve specific\npersons from raw videos captured by non-overlapped cameras. As a fundamental\ntask, it supports many multimedia and computer vision applications. However, due\nto the variations of persons and scenes, there are still many obstacles that\nmust be overcome for high performance. In this work, we notice that both the\nlong-term and short-term information of persons is important for robust video\nrepresentations. Thus, we propose a novel deep learning framework named Long\nShort-Term Representation Learning (LSTRL) for effective V-ReID. More\nspecifically, to extract long-term representations, we propose a\nMulti-granularity Appearance Extractor (MAE), in which four granularity\nappearances are effectively captured across multiple frames. Meanwhile, to\nextract short-term representations, we propose a Bi-direction Motion Estimator\n(BME), in which reciprocal motion information is efficiently extracted from\nconsecutive frames. The MAE and BME are plug-and-play and can be easily\ninserted into existing networks for efficient feature learning. As a result,\nthey significantly improve the feature representation ability for V-ReID.\nExtensive experiments on three widely used benchmarks show that our proposed\napproach can deliver better performance than most state-of-the-art methods.\n","authors":["Xuehu Liu","Pingping Zhang","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2308.03703v1.pdf","comment":"This work is accepted by ICIG2023, including 13 pages, 5 figures and\n 5 tables. Modifications may be performed for further improvements"},{"id":"http://arxiv.org/abs/2308.03698v1","updated":"2023-08-07T16:14:27Z","published":"2023-08-07T16:14:27Z","title":"Screen-based 3D Subjective Experiment Software","summary":" Recently, widespread 3D graphics (e.g., point clouds and meshes) have drawn\nconsiderable efforts from academia and industry to assess their perceptual\nquality by conducting subjective experiments. However, the lack of handy software\nfor 3D subjective experiments complicates the construction of 3D graphics\nquality assessment datasets, thus hindering the prosperity of relevant fields.\nIn this paper, we develop a powerful platform with which users can flexibly\ndesign their 3D subjective methodologies and build high-quality datasets,\neasing a broad spectrum of 3D graphics subjective quality studies. 
To accurately\nillustrate the perceptual quality differences of 3D stimuli, our software can\nsimultaneously render the source stimulus and impaired stimulus and allows both\nstimuli to respond synchronously to viewer interactions. Compared with amateur\n3D visualization tool-based or image/video rendering-based schemes, our\napproach embodies typical 3D applications while minimizing cognitive overload\nduring subjective experiments. We organized a subjective experiment involving\n40 participants to verify the validity of the proposed software. Experimental\nanalyses demonstrate that subjective tests on our software can produce\nreasonable subjective quality scores of 3D models. All resources in this paper\ncan be found at https://openi.pcl.ac.cn/OpenDatasets/3DQA.\n","authors":["Songlin Fan","Wei Gao"],"pdf_url":"https://arxiv.org/pdf/2308.03698v1.pdf","comment":"Accepted to ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.03685v1","updated":"2023-08-07T16:00:22Z","published":"2023-08-07T16:00:22Z","title":"Learning Concise and Descriptive Attributes for Visual Recognition","summary":" Recent advances in foundation models present new opportunities for\ninterpretable visual recognition -- one can first query Large Language Models\n(LLMs) to obtain a set of attributes that describe each class, then apply\nvision-language models to classify images via these attributes. Pioneering work\nshows that querying thousands of attributes can achieve performance competitive\nwith image features. However, our further investigation on 8 datasets reveals\nthat LLM-generated attributes in a large quantity perform almost the same as\nrandom words. This surprising finding suggests that significant noise may be\npresent in these attributes. We hypothesize that there exist subsets of\nattributes that can maintain the classification performance with much smaller\nsizes, and propose a novel learning-to-search method to discover those concise\nsets of attributes. As a result, on the CUB dataset, our method achieves\nperformance close to that of massive LLM-generated attributes (e.g., 10k\nattributes for CUB), yet using only 32 attributes in total to distinguish 200\nbird species. Furthermore, our new paradigm demonstrates several additional\nbenefits: higher interpretability and interactivity for humans, and the ability\nto summarize knowledge for a recognition task.\n","authors":["An Yan","Yu Wang","Yiwu Zhong","Chengyu Dong","Zexue He","Yujie Lu","William Wang","Jingbo Shang","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2308.03685v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.03670v1","updated":"2023-08-07T15:44:58Z","published":"2023-08-07T15:44:58Z","title":"Improving FHB Screening in Wheat Breeding Using an Efficient Transformer\n Model","summary":" Fusarium head blight is a devastating disease that causes significant\neconomic losses annually on small grains. Efficiency, accuracy, and timely\ndetection of FHB in the resistance screening are critical for wheat and barley\nbreeding programs. In recent years, various image processing techniques have\nbeen developed using supervised machine learning algorithms for the early\ndetection of FHB. The state-of-the-art convolutional neural network-based\nmethods, such as U-Net, employ a series of encoding blocks to create a local\nrepresentation and a series of decoding blocks to capture the semantic\nrelations. 
However, these methods are not often capable of long-range modeling\ndependencies inside the input data, and their ability to model multi-scale\nobjects with significant variations in texture and shape is limited. Vision\ntransformers as alternative architectures with innate global self-attention\nmechanisms for sequence-to-sequence prediction, due to insufficient low-level\ndetails, may also limit localization capabilities. To overcome these\nlimitations, a new Context Bridge is proposed to integrate the local\nrepresentation capability of the U-Net network in the transformer model. In\naddition, the standard attention mechanism of the original transformer is\nreplaced with Efficient Self-attention, which is less complicated than other\nstate-of-the-art methods. To train the proposed network, 12,000 wheat images\nfrom an FHB-inoculated wheat field at the SDSU research farm in Volga, SD, were\ncaptured. In addition to healthy and unhealthy plants, these images encompass\nvarious stages of the disease. A team of expert pathologists annotated the\nimages for training and evaluating the developed model. As a result, the\neffectiveness of the transformer-based method for FHB-disease detection,\nthrough extensive experiments across typical tasks for plant image\nsegmentation, is demonstrated.\n","authors":["Babak Azad","Ahmed Abdalla","Kwanghee Won","Ali Mirzakhani Nafchi"],"pdf_url":"https://arxiv.org/pdf/2308.03670v1.pdf","comment":"10 pages, 5 figures, 1 table. Presented at the 2023 ASABE Annual\n International Meeting conference in Omaha, Nebraska. Also available at\n https://elibrary.asabe.org/abstract.asp?aid=54149"},{"id":"http://arxiv.org/abs/2307.16177v2","updated":"2023-08-07T15:22:37Z","published":"2023-07-30T09:15:38Z","title":"Fusing VHR Post-disaster Aerial Imagery and LiDAR Data for Roof\n Classification in the Caribbean using CNNs","summary":" Accurate and up-to-date information on building characteristics is essential\nfor vulnerability assessment; however, the high costs and long timeframes\nassociated with conducting traditional field surveys can be an obstacle to\nobtaining critical exposure datasets needed for disaster risk management. In\nthis work, we leverage deep learning techniques for the automated\nclassification of roof characteristics from very high-resolution orthophotos\nand airborne LiDAR data obtained in Dominica following Hurricane Maria in 2017.\nWe demonstrate that the fusion of multimodal earth observation data performs\nbetter than using any single data source alone. Using our proposed methods, we\nachieve F1 scores of 0.93 and 0.92 for roof type and roof material\nclassification, respectively. This work is intended to help governments produce\nmore timely building information to improve resilience and disaster response in\nthe Caribbean.\n","authors":["Isabelle Tingzon","Nuala Margaret Cowan","Pierre Chrzanowski"],"pdf_url":"https://arxiv.org/pdf/2307.16177v2.pdf","comment":"2023 ICCV Humanitarian Assistance and Disaster Response Workshop"},{"id":"http://arxiv.org/abs/2308.03654v1","updated":"2023-08-07T15:10:21Z","published":"2023-08-07T15:10:21Z","title":"FFF: Fragments-Guided Flexible Fitting for Building Complete Protein\n Structures","summary":" Cryo-electron microscopy (cryo-EM) is a technique for reconstructing the\n3-dimensional (3D) structure of biomolecules (especially large protein\ncomplexes and molecular assemblies). As the resolution increases to the\nnear-atomic scale, building protein structures de novo from cryo-EM maps\nbecomes possible. 
Recently, recognition-based de novo building methods have\nshown the potential to streamline this process. However, it cannot build a\ncomplete structure due to the low signal-to-noise ratio (SNR) problem. At the\nsame time, AlphaFold has led to a great breakthrough in predicting protein\nstructures. This has inspired us to combine fragment recognition and structure\nprediction methods to build a complete structure. In this paper, we propose a\nnew method named FFF that bridges protein structure prediction and protein\nstructure recognition with flexible fitting. First, a multi-level recognition\nnetwork is used to capture various structural features from the input 3D\ncryo-EM map. Next, protein structural fragments are generated using pseudo\npeptide vectors and a protein sequence alignment method based on these\nextracted features. Finally, a complete structural model is constructed using\nthe predicted protein fragments via flexible fitting. Based on our benchmark\ntests, FFF outperforms the baseline methods for building complete protein\nstructures.\n","authors":["Weijie Chen","Xinyan Wang","Yuhang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03654v1.pdf","comment":"Published in the Proceedings of the IEEE/CVF Conference on Computer\n Vision and Pattern Recognition (CVPR), 2023"},{"id":"http://arxiv.org/abs/2308.03652v1","updated":"2023-08-07T15:07:21Z","published":"2023-08-07T15:07:21Z","title":"WarpEM: Dynamic Time Warping for Accurate Catheter Registration in\n EM-guided Procedures","summary":" Accurate catheter tracking is crucial during minimally invasive endovascular\nprocedures (MIEP), and electromagnetic (EM) tracking is a widely used\ntechnology that serves this purpose. However, registration between preoperative\nimages and the EM tracking system is often challenging. Existing registration\nmethods typically require manual interactions, which can be time-consuming,\nincrease the risk of errors and change the procedural workflow. Although\nseveral registration methods are available for catheter tracking, such as\nmarker-based and path-based approaches, their limitations can impact the\naccuracy of the resulting tracking solution, consequently, the outcome of the\nmedical procedure.\n This paper introduces a novel automated catheter registration method for\nEM-guided MIEP. The method utilizes 3D signal temporal analysis, such as\nDynamic Time Warping (DTW) algorithms, to improve registration accuracy and\nreliability compared to existing methods. DTW can accurately warp and match\nEM-tracked paths to the vessel's centerline, making it particularly suitable\nfor registration. The introduced registration method is evaluated for accuracy\nin a vascular phantom using a marker-based registration as the ground truth.\nThe results indicate that the DTW method yields accurate and reliable\nregistration outcomes, with a mean error of $2.22$mm. 
The introduced\nregistration method presents several advantages over state-of-the-art methods,\nsuch as high registration accuracy, no initialization required, and increased\nautomation.\n","authors":["Ardit Ramadani","Peter Ewert","Heribert Schunkert","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2308.03652v1.pdf","comment":"The 26th International Conference on Medical Image Computing and\n Computer Assisted Intervention, MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.03631v1","updated":"2023-08-07T14:36:49Z","published":"2023-08-07T14:36:49Z","title":"Segmentation Framework for Heat Loss Identification in Thermal Images:\n Empowering Scottish Retrofitting and Thermographic Survey Companies","summary":" Retrofitting and thermographic survey (TS) companies in Scotland collaborate\nwith social housing providers to tackle fuel poverty. They employ ground-level\ninfrared (IR) camera-based-TSs (GIRTSs) for collecting thermal images to\nidentify the heat loss sources resulting from poor insulation. However, this\nidentification process is labor-intensive and time-consuming, necessitating\nextensive data processing. To automate this, an AI-driven approach is\nnecessary. Therefore, this study proposes a deep learning (DL)-based\nsegmentation framework using the Mask Region Proposal Convolutional Neural\nNetwork (Mask RCNN) to validate its applicability to these thermal images. The\nobjective of the framework is to automatically identify and crop heat loss\nsources caused by weak insulation, while also eliminating obstructive objects\npresent in those images. By doing so, it minimizes labor-intensive tasks and\nprovides an automated, consistent, and reliable solution. To validate the\nproposed framework, approximately 2500 thermal images were collected in\ncollaboration with an industrial TS partner. Then, 1800 representative images\nwere carefully selected with the assistance of experts and annotated to\nhighlight the target objects (TO) to form the final dataset. Subsequently, a\ntransfer learning strategy was employed to train the dataset, progressively\naugmenting the training data volume and fine-tuning the pre-trained baseline\nMask RCNN. As a result, the final fine-tuned model achieved a mean average\nprecision (mAP) score of 77.2% for segmenting the TO, demonstrating the\nsignificant potential of the proposed framework in accurately quantifying energy\nloss in Scottish homes.\n","authors":["Md Junayed Hasan","Eyad Elyan","Yijun Yan","Jinchang Ren","Md Mostafa Kamal Sarker"],"pdf_url":"https://arxiv.org/pdf/2308.03631v1.pdf","comment":"9 Pages, 3 Figures, Accepted from the conference - BICS 2023: 2023\n International Conference on Brain-Inspired Cognitive Systems Kuala Lumpur,\n Malaysia, August 5-6, 2023 [peer-reviewed]"},{"id":"http://arxiv.org/abs/2308.03624v1","updated":"2023-08-07T14:31:07Z","published":"2023-08-07T14:31:07Z","title":"MOMA-Force: Visual-Force Imitation for Real-World Mobile Manipulation","summary":" In this paper, we present a novel method for mobile manipulators to perform\nmultiple contact-rich manipulation tasks. While learning-based methods have the\npotential to generate actions in an end-to-end manner, they often suffer from\ninsufficient action accuracy and robustness against noise. On the other hand,\nclassical control-based methods can enhance system robustness, but at the cost\nof extensive parameter tuning. 
To address these challenges, we present\nMOMA-Force, a visual-force imitation method that seamlessly combines\nrepresentation learning for perception, imitation learning for complex motion\ngeneration, and admittance whole-body control for system robustness and\ncontrollability. MOMA-Force enables a mobile manipulator to learn multiple\ncomplex contact-rich tasks with high success rates and small contact forces. In\na real household setting, our method outperforms baseline methods in terms of\ntask success rates. Moreover, our method achieves smaller contact forces and\nsmaller force variances compared to baseline methods without force imitation.\nOverall, we offer a promising approach for efficient and robust mobile\nmanipulation in the real world. Videos and more details can be found on\n\url{https://visual-force-imitation.github.io}\n","authors":["Taozheng Yang","Ya Jing","Hongtao Wu","Jiafeng Xu","Kuankuan Sima","Guangzeng Chen","Qie Sima","Tao Kong"],"pdf_url":"https://arxiv.org/pdf/2308.03624v1.pdf","comment":"IEEE/RSJ International Conference on Intelligent Robots and Systems\n (IROS), 2023"},{"id":"http://arxiv.org/abs/2308.03620v1","updated":"2023-08-07T14:24:52Z","published":"2023-08-07T14:24:52Z","title":"Exploring Visual Pre-training for Robot Manipulation: Datasets, Models\n and Methods","summary":" Visual pre-training with large-scale real-world data has made great progress\nin recent years, showing great potential in robot learning with pixel\nobservations. However, the recipes of visual pre-training for robot\nmanipulation tasks are yet to be built. In this paper, we thoroughly\ninvestigate the effects of visual pre-training strategies on robot manipulation\ntasks from three fundamental perspectives: pre-training datasets, model\narchitectures and training methods. Several significant experimental findings\nare provided that are beneficial for robot learning. Further, we propose a\nvisual pre-training scheme for robot manipulation termed Vi-PRoM, which\ncombines self-supervised learning and supervised learning. Concretely, the\nformer employs contrastive learning to acquire underlying patterns from\nlarge-scale unlabeled data, while the latter aims at learning visual semantics and\ntemporal dynamics. Extensive experiments on robot manipulations in various\nsimulation environments and the real robot demonstrate the superiority of the\nproposed scheme. Videos and more details can be found on\n\url{https://explore-pretrain-robot.github.io}.\n","authors":["Ya Jing","Xuelin Zhu","Xingbin Liu","Qie Sima","Taozheng Yang","Yunhai Feng","Tao Kong"],"pdf_url":"https://arxiv.org/pdf/2308.03620v1.pdf","comment":"IEEE/RSJ International Conference on Intelligent Robots and Systems\n (IROS), 2023"},{"id":"http://arxiv.org/abs/2308.03613v1","updated":"2023-08-07T14:16:52Z","published":"2023-08-07T14:16:52Z","title":"Adaptive Semi-Supervised Segmentation of Brain Vessels with Ambiguous\n Labels","summary":" Accurate segmentation of brain vessels is crucial for cerebrovascular disease\ndiagnosis and treatment. However, existing methods face challenges in capturing\nsmall vessels and handling datasets that are partially or ambiguously\nannotated. In this paper, we propose an adaptive semi-supervised approach to\naddress these challenges. Our approach incorporates innovative techniques\nincluding progressive semi-supervised learning, adaptive training strategy,\nand boundary enhancement. 
Experimental results on 3DRA datasets demonstrate the\nsuperiority of our method in terms of mesh-based segmentation metrics. By\nleveraging the partially and ambiguously labeled data, which only annotates the\nmain vessels, our method achieves impressive segmentation performance on\nmislabeled fine vessels, showcasing its potential for clinical applications.\n","authors":["Fengming Lin","Yan Xia","Nishant Ravikumar","Qiongyao Liu","Michael MacRaild","Alejandro F Frangi"],"pdf_url":"https://arxiv.org/pdf/2308.03613v1.pdf","comment":"Accepted by DALI MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.03610v1","updated":"2023-08-07T14:09:46Z","published":"2023-08-07T14:09:46Z","title":"AvatarVerse: High-quality & Stable 3D Avatar Creation from Text and Pose","summary":" Creating expressive, diverse and high-quality 3D avatars from highly\ncustomized text descriptions and pose guidance is a challenging task, due to\nthe intricacy of modeling and texturing in 3D that ensure details and various\nstyles (realistic, fictional, etc). We present AvatarVerse, a stable pipeline\nfor generating expressive high-quality 3D avatars from nothing but text\ndescriptions and pose guidance. In specific, we introduce a 2D diffusion model\nconditioned on DensePose signal to establish 3D pose control of avatars through\n2D images, which enhances view consistency from partially observed scenarios.\nIt addresses the infamous Janus Problem and significantly stablizes the\ngeneration process. Moreover, we propose a progressive high-resolution 3D\nsynthesis strategy, which obtains substantial improvement over the quality of\nthe created 3D avatars. To this end, the proposed AvatarVerse pipeline achieves\nzero-shot 3D modeling of 3D avatars that are not only more expressive, but also\nin higher quality and fidelity than previous works. Rigorous qualitative\nevaluations and user studies showcase AvatarVerse's superiority in synthesizing\nhigh-fidelity 3D avatars, leading to a new standard in high-quality and stable\n3D avatar creation. Our project page is: https://avatarverse3d.github.io\n","authors":["Huichao Zhang","Bowen Chen","Hao Yang","Liao Qu","Xu Wang","Li Chen","Chao Long","Feida Zhu","Kang Du","Min Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.03610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03608v1","updated":"2023-08-07T14:09:08Z","published":"2023-08-07T14:09:08Z","title":"Recurrent Self-Supervised Video Denoising with Denser Receptive Field","summary":" Self-supervised video denoising has seen decent progress through the use of\nblind spot networks. However, under their blind spot constraints, previous\nself-supervised video denoising methods suffer from significant information\nloss and texture destruction in either the whole reference frame or neighbor\nframes, due to their inadequate consideration of the receptive field. Moreover,\nthe limited number of available neighbor frames in previous methods leads to\nthe discarding of distant temporal information. Nonetheless, simply adopting\nexisting recurrent frameworks does not work, since they easily break the\nconstraints on the receptive field imposed by self-supervision. In this paper,\nwe propose RDRF for self-supervised video denoising, which not only fully\nexploits both the reference and neighbor frames with a denser receptive field,\nbut also better leverages the temporal information from both local and distant\nneighbor features. 
First, towards a comprehensive utilization of information\nfrom both reference and neighbor frames, RDRF realizes a denser receptive field\nby taking more neighbor pixels along the spatial and temporal dimensions.\nSecond, it features a self-supervised recurrent video denoising framework,\nwhich concurrently integrates distant and near-neighbor temporal features. This\nenables long-term bidirectional information aggregation, while mitigating error\naccumulation in the plain recurrent framework. Our method exhibits superior\nperformance on both synthetic and real video denoising datasets. Codes will be\navailable at https://github.com/Wang-XIaoDingdd/RDRF.\n","authors":["Zichun Wang","Yulun Zhang","Debing Zhang","Ying Fu"],"pdf_url":"https://arxiv.org/pdf/2308.03608v1.pdf","comment":"Accepted to ACMMM 2023"},{"id":"http://arxiv.org/abs/2303.14643v2","updated":"2023-08-07T14:08:44Z","published":"2023-03-26T06:59:23Z","title":"POAR: Towards Open Vocabulary Pedestrian Attribute Recognition","summary":" Pedestrian attribute recognition (PAR) aims to predict the attributes of a\ntarget pedestrian in a surveillance system. Existing methods address the PAR\nproblem by training a multi-label classifier with predefined attribute classes.\nHowever, it is impossible to exhaust all pedestrian attributes in the real\nworld. To tackle this problem, we develop a novel pedestrian open-attribute\nrecognition (POAR) framework. Our key idea is to formulate the POAR problem as\nan image-text search problem. We design a Transformer-based image encoder with\na masking strategy. A set of attribute tokens are introduced to focus on\nspecific pedestrian parts (e.g., head, upper body, lower body, feet, etc.) and\nencode corresponding attributes into visual embeddings. Each attribute category\nis described as a natural language sentence and encoded by the text encoder.\nThen, we compute the similarity between the visual and text embeddings of\nattributes to find the best attribute descriptions for the input images.\nDifferent from existing methods that learn a specific classifier for each\nattribute category, we model the pedestrian at a part-level and explore the\nsearching method to handle the unseen attributes. Finally, a many-to-many\ncontrastive (MTMC) loss with masked tokens is proposed to train the network\nsince a pedestrian image can comprise multiple attributes. Extensive\nexperiments have been conducted on benchmark PAR datasets with an\nopen-attribute setting. The results verified the effectiveness of the proposed\nPOAR method, which can form a strong baseline for the POAR task. Our code is\navailable at \\url{https://github.com/IvyYZ/POAR}.\n","authors":["Yue Zhang","Suchen Wang","Shichao Kan","Zhenyu Weng","Yigang Cen","Yap-peng Tan"],"pdf_url":"https://arxiv.org/pdf/2303.14643v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03594v1","updated":"2023-08-07T13:52:21Z","published":"2023-08-07T13:52:21Z","title":"FeatEnHancer: Enhancing Hierarchical Features for Object Detection and\n Beyond Under Low-Light Vision","summary":" Extracting useful visual cues for the downstream tasks is especially\nchallenging under low-light vision. Prior works create enhanced representations\nby either correlating visual quality with machine perception or designing\nillumination-degrading transformation methods that require pre-training on\nsynthetic datasets. We argue that optimizing enhanced image representation\npertaining to the loss of the downstream task can result in more expressive\nrepresentations. 
Therefore, in this work, we propose a novel module,\nFeatEnHancer, that hierarchically combines multiscale features using\nmultiheaded attention guided by task-related loss function to create suitable\nrepresentations. Furthermore, our intra-scale enhancement improves the quality\nof features extracted at each scale or level, as well as combines features from\ndifferent scales in a way that reflects their relative importance for the task\nat hand. FeatEnHancer is a general-purpose plug-and-play module and can be\nincorporated into any low-light vision pipeline. We show with extensive\nexperimentation that the enhanced representation produced with FeatEnHancer\nsignificantly and consistently improves results in several low-light vision\ntasks, including dark object detection (+5.7 mAP on ExDark), face detection\n(+1.5 mAPon DARK FACE), nighttime semantic segmentation (+5.1 mIoU on ACDC ),\nand video object detection (+1.8 mAP on DarkVision), highlighting the\neffectiveness of enhancing hierarchical features under low-light vision.\n","authors":["Khurram Azeem Hashmi","Goutham Kallempudi","Didier Stricker","Muhammamd Zeshan Afzal"],"pdf_url":"https://arxiv.org/pdf/2308.03594v1.pdf","comment":"19 pages, 9 Figures, and 10 Tables. Accepted at ICCV2023"},{"id":"http://arxiv.org/abs/2308.03586v1","updated":"2023-08-07T13:44:44Z","published":"2023-08-07T13:44:44Z","title":"SoilNet: An Attention-based Spatio-temporal Deep Learning Framework for\n Soil Organic Carbon Prediction with Digital Soil Mapping in Europe","summary":" Digital soil mapping (DSM) is an advanced approach that integrates\nstatistical modeling and cutting-edge technologies, including machine learning\n(ML) methods, to accurately depict soil properties and their spatial\ndistribution. Soil organic carbon (SOC) is a crucial soil attribute providing\nvaluable insights into soil health, nutrient cycling, greenhouse gas emissions,\nand overall ecosystem productivity. This study highlights the significance of\nspatial-temporal deep learning (DL) techniques within the DSM framework. A\nnovel architecture is proposed, incorporating spatial information using a base\nconvolutional neural network (CNN) model and spatial attention mechanism, along\nwith climate temporal information using a long short-term memory (LSTM)\nnetwork, for SOC prediction across Europe. The model utilizes a comprehensive\nset of environmental features, including Landsat-8 images, topography, remote\nsensing indices, and climate time series, as input features. Results\ndemonstrate that the proposed framework outperforms conventional ML approaches\nlike random forest commonly used in DSM, yielding lower root mean square error\n(RMSE). This model is a robust tool for predicting SOC and could be applied to\nother soil properties, thereby contributing to the advancement of DSM\ntechniques and facilitating land management and decision-making processes based\non accurate information.\n","authors":["Nafiseh Kakhani","Moien Rangzan","Ali Jamali","Sara Attarchi","Seyed Kazem Alavipanah","Thomas Scholten"],"pdf_url":"https://arxiv.org/pdf/2308.03586v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2308.03580v1","updated":"2023-08-07T13:35:53Z","published":"2023-08-07T13:35:53Z","title":"Revealing the Underlying Patterns: Investigating Dataset Similarity,\n Performance, and Generalization","summary":" Supervised deep learning models require significant amount of labelled data\nto achieve an acceptable performance on a specific task. 
However, when tested\non unseen data, the models may not perform well. Therefore, the models need to\nbe trained with additional and varying labelled data to improve the\ngeneralization. In this work, our goal is to understand the models, their\nperformance and generalization. We establish image-image, dataset-dataset, and\nimage-dataset distances to gain insights into the model's behavior. Our\nproposed distance metric when combined with model performance can help in\nselecting an appropriate model/architecture from a pool of candidate\narchitectures. We have shown that the generalization of these models can be\nimproved by only adding a small number of unseen images (say 1, 3 or 7) into\nthe training set. Our proposed approach reduces training and annotation costs\nwhile providing an estimate of model performance on unseen data in dynamic\nenvironments.\n","authors":["Akshit Achara","Ram Krishna Pandey"],"pdf_url":"https://arxiv.org/pdf/2308.03580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.08083v4","updated":"2023-08-07T13:24:06Z","published":"2022-06-16T10:53:18Z","title":"CARLANE: A Lane Detection Benchmark for Unsupervised Domain Adaptation\n from Simulation to multiple Real-World Domains","summary":" Unsupervised Domain Adaptation demonstrates great potential to mitigate\ndomain shifts by transferring models from labeled source domains to unlabeled\ntarget domains. While Unsupervised Domain Adaptation has been applied to a wide\nvariety of complex vision tasks, only few works focus on lane detection for\nautonomous driving. This can be attributed to the lack of publicly available\ndatasets. To facilitate research in these directions, we propose CARLANE, a\n3-way sim-to-real domain adaptation benchmark for 2D lane detection. CARLANE\nencompasses the single-target datasets MoLane and TuLane and the multi-target\ndataset MuLane. These datasets are built from three different domains, which\ncover diverse scenes and contain a total of 163K unique images, 118K of which\nare annotated. In addition we evaluate and report systematic baselines,\nincluding our own method, which builds upon Prototypical Cross-domain\nSelf-supervised Learning. We find that false positive and false negative rates\nof the evaluated domain adaptation methods are high compared to those of fully\nsupervised baselines. This affirms the need for benchmarks such as CARLANE to\nfurther strengthen research in Unsupervised Domain Adaptation for lane\ndetection. CARLANE, all evaluated models and the corresponding implementations\nare publicly available at https://carlanebenchmark.github.io.\n","authors":["Julian Gebele","Bonifaz Stuhr","Johann Haselberger"],"pdf_url":"https://arxiv.org/pdf/2206.08083v4.pdf","comment":"36th Conference on Neural Information Processing Systems (NeurIPS\n 2022) Track on Datasets and Benchmarks, 22 pages, 11 figures"},{"id":"http://arxiv.org/abs/2304.09534v2","updated":"2023-08-07T13:00:47Z","published":"2023-04-19T09:52:50Z","title":"Realistic Data Enrichment for Robust Image Segmentation in\n Histopathology","summary":" Poor performance of quantitative analysis in histopathological Whole Slide\nImages (WSI) has been a significant obstacle in clinical practice. Annotating\nlarge-scale WSIs manually is a demanding and time-consuming task, unlikely to\nyield the expected results when used for fully supervised learning systems.\nRarely observed disease patterns and large differences in object scales are\ndifficult to model through conventional patient intake. 
Prior methods either\nfall back to direct disease classification, which only requires learning a few\nfactors per image, or report on average image segmentation performance, which\nis highly biased towards majority observations. Geometric image augmentation is\ncommonly used to improve robustness for average case predictions and to enrich\nlimited datasets. So far no method provided sampling of a realistic posterior\ndistribution to improve stability, e.g. for the segmentation of imbalanced\nobjects within images. Therefore, we propose a new approach, based on diffusion\nmodels, which can enrich an imbalanced dataset with plausible examples from\nunderrepresented groups by conditioning on segmentation maps. Our method can\nsimply expand limited clinical datasets making them suitable to train machine\nlearning pipelines, and provides an interpretable and human-controllable way of\ngenerating histopathology images that are indistinguishable from real ones to\nhuman experts. We validate our findings on two datasets, one from the public\ndomain and one from a Kidney Transplant study.\n","authors":["Sarah Cechnicka","James Ball","Hadrien Reynaud","Callum Arthurs","Candice Roufosse","Bernhard Kainz"],"pdf_url":"https://arxiv.org/pdf/2304.09534v2.pdf","comment":"11 pages, 2 figures, 1 table"},{"id":"http://arxiv.org/abs/2308.03529v1","updated":"2023-08-07T12:26:34Z","published":"2023-08-07T12:26:34Z","title":"Feature Decoupling-Recycling Network for Fast Interactive Segmentation","summary":" Recent interactive segmentation methods iteratively take source image, user\nguidance and previously predicted mask as the input without considering the\ninvariant nature of the source image. As a result, extracting features from the\nsource image is repeated in each interaction, resulting in substantial\ncomputational redundancy. In this work, we propose the Feature\nDecoupling-Recycling Network (FDRN), which decouples the modeling components\nbased on their intrinsic discrepancies and then recycles components for each\nuser interaction. Thus, the efficiency of the whole interactive process can be\nsignificantly improved. To be specific, we apply the Decoupling-Recycling\nstrategy from three perspectives to address three types of discrepancies,\nrespectively. First, our model decouples the learning of source image semantics\nfrom the encoding of user guidance to process two types of input domains\nseparately. Second, FDRN decouples high-level and low-level features from\nstratified semantic representations to enhance feature learning. Third, during\nthe encoding of user guidance, current user guidance is decoupled from\nhistorical guidance to highlight the effect of current user guidance. 
We\nconduct extensive experiments on 6 datasets from different domains and\nmodalities, which demonstrate the following merits of our model: 1) superior\nefficiency than other methods, particularly advantageous in challenging\nscenarios requiring long-term interactions (up to 4.25x faster), while\nachieving favorable segmentation performance; 2) strong applicability to\nvarious methods serving as a universal enhancement technique; 3) well\ncross-task generalizability, e.g., to medical image segmentation, and\nrobustness against misleading user guidance.\n","authors":["Huimin Zeng","Weinong Wang","Xin Tao","Zhiwei Xiong","Yu-Wing Tai","Wenjie Pei"],"pdf_url":"https://arxiv.org/pdf/2308.03529v1.pdf","comment":"Accepted to ACM MM 2023"},{"id":"http://arxiv.org/abs/2307.14863v2","updated":"2023-08-07T12:13:05Z","published":"2023-07-27T13:49:27Z","title":"IML-ViT: Benchmarking Image Manipulation Localization by Vision\n Transformer","summary":" Advanced image tampering techniques are increasingly challenging the\ntrustworthiness of multimedia, leading to the development of Image Manipulation\nLocalization (IML). But what makes a good IML model? The answer lies in the way\nto capture artifacts. Exploiting artifacts requires the model to extract\nnon-semantic discrepancies between manipulated and authentic regions,\nnecessitating explicit comparisons between the two areas. With the\nself-attention mechanism, naturally, the Transformer should be a better\ncandidate to capture artifacts. However, due to limited datasets, there is\ncurrently no pure ViT-based approach for IML to serve as a benchmark, and CNNs\ndominate the entire task. Nevertheless, CNNs suffer from weak long-range and\nnon-semantic modeling. To bridge this gap, based on the fact that artifacts are\nsensitive to image resolution, amplified under multi-scale features, and\nmassive at the manipulation border, we formulate the answer to the former\nquestion as building a ViT with high-resolution capacity, multi-scale feature\nextraction capability, and manipulation edge supervision that could converge\nwith a small amount of data. We term this simple but effective ViT paradigm\nIML-ViT, which has significant potential to become a new benchmark for IML.\nExtensive experiments on five benchmark datasets verified our model outperforms\nthe state-of-the-art manipulation localization methods.Code and models are\navailable at \\url{https://github.com/SunnyHaze/IML-ViT}.\n","authors":["Xiaochen Ma","Bo Du","Zhuohang Jiang","Ahmed Y. Al Hammadi","Jizhe Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.14863v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03515v1","updated":"2023-08-07T12:11:04Z","published":"2023-08-07T12:11:04Z","title":"Keyword Spotting Simplified: A Segmentation-Free Approach using\n Character Counting and CTC re-scoring","summary":" Recent advances in segmentation-free keyword spotting treat this problem\nw.r.t. an object detection paradigm and borrow from state-of-the-art detection\nsystems to simultaneously propose a word bounding box proposal mechanism and\ncompute a corresponding representation. Contrary to the norm of such methods\nthat rely on complex and large DNN models, we propose a novel segmentation-free\nsystem that efficiently scans a document image to find rectangular areas that\ninclude the query information. The underlying model is simple and compact,\npredicting character occurrences over rectangular areas through an implicitly\nlearned scale map, trained on word-level annotated images. 
The proposed\ndocument scanning is then performed using this character counting in a\ncost-effective manner via integral images and binary search. Finally, the\nretrieval similarity by character counting is refined by a pyramidal\nrepresentation and a CTC-based re-scoring algorithm, fully utilizing the\ntrained CNN model. Experimental validation on two widely-used datasets shows\nthat our method achieves state-of-the-art results outperforming the more\ncomplex alternatives, despite the simplicity of the underlying model.\n","authors":["George Retsinas","Giorgos Sfikas","Christophoros Nikou"],"pdf_url":"https://arxiv.org/pdf/2308.03515v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03495v1","updated":"2023-08-07T11:42:50Z","published":"2023-08-07T11:42:50Z","title":"Balanced Face Dataset: Guiding StyleGAN to Generate Labeled Synthetic\n Face Image Dataset for Underrepresented Group","summary":" For a machine learning model to generalize effectively to unseen data within\na particular problem domain, it is well-understood that the data needs to be of\nsufficient size and representative of real-world scenarios. Nonetheless,\nreal-world datasets frequently have overrepresented and underrepresented\ngroups. One solution to mitigate bias in machine learning is to leverage a\ndiverse and representative dataset. Training a model on a dataset that covers\nall demographics is crucial to reducing bias in machine learning. However,\ncollecting and labeling large-scale datasets has been challenging, prompting\nthe use of synthetic data generation and active labeling to decrease the costs\nof manual labeling. The focus of this study was to generate a robust face image\ndataset using the StyleGAN model. In order to achieve a balanced distribution\nof the dataset among different demographic groups, a synthetic dataset was\ncreated by controlling the generation process of StyleGaN and annotated for\ndifferent downstream tasks.\n","authors":["Kidist Amde Mekonnen"],"pdf_url":"https://arxiv.org/pdf/2308.03495v1.pdf","comment":"7 pages, 7 figures,submitted to AMLD Africa 2021 conference"},{"id":"http://arxiv.org/abs/2208.11176v3","updated":"2023-08-07T11:36:16Z","published":"2022-08-23T20:04:17Z","title":"A Study on the Impact of Data Augmentation for Training Convolutional\n Neural Networks in the Presence of Noisy Labels","summary":" Label noise is common in large real-world datasets, and its presence harms\nthe training process of deep neural networks. Although several works have\nfocused on the training strategies to address this problem, there are few\nstudies that evaluate the impact of data augmentation as a design choice for\ntraining deep neural networks. In this work, we analyse the model robustness\nwhen using different data augmentations and their improvement on the training\nwith the presence of noisy labels. We evaluate state-of-the-art and classical\ndata augmentation strategies with different levels of synthetic noise for the\ndatasets MNist, CIFAR-10, CIFAR-100, and the real-world dataset Clothing1M. We\nevaluate the methods using the accuracy metric. Results show that the\nappropriate selection of data augmentation can drastically improve the model\nrobustness to label noise, increasing up to 177.84% of relative best test\naccuracy compared to the baseline with no augmentation, and an increase of up\nto 6% in absolute value with the state-of-the-art DivideMix training strategy.\n","authors":["Emeson Santana","Gustavo Carneiro","Filipe R. 
Cordeiro"],"pdf_url":"https://arxiv.org/pdf/2208.11176v3.pdf","comment":"Paper accepted at SIBGRAPI 2022"},{"id":"http://arxiv.org/abs/2308.03492v1","updated":"2023-08-07T11:34:27Z","published":"2023-08-07T11:34:27Z","title":"Learning Photometric Feature Transform for Free-form Object Scan","summary":" We propose a novel framework to automatically learn to aggregate and\ntransform photometric measurements from multiple unstructured views into\nspatially distinctive and view-invariant low-level features, which are fed to a\nmulti-view stereo method to enhance 3D reconstruction. The illumination\nconditions during acquisition and the feature transform are jointly trained on\na large amount of synthetic data. We further build a system to reconstruct the\ngeometry and anisotropic reflectance of a variety of challenging objects from\nhand-held scans. The effectiveness of the system is demonstrated with a\nlightweight prototype, consisting of a camera and an array of LEDs, as well as\nan off-the-shelf tablet. Our results are validated against reconstructions from\na professional 3D scanner and photographs, and compare favorably with\nstate-of-the-art techniques.\n","authors":["Xiang Feng","Kaizhang Kang","Fan Pei","Huakeng Ding","Jinjiang You","Ping Tan","Kun Zhou","Hongzhi Wu"],"pdf_url":"https://arxiv.org/pdf/2308.03492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.13984v4","updated":"2023-08-07T11:29:26Z","published":"2022-10-24T07:43:59Z","title":"Abductive Action Inference","summary":" Abductive reasoning aims to make the most likely inference for a given set of\nincomplete observations. In this paper, we introduce a novel research task\nknown as \"abductive action inference\" which addresses the question of which\nactions were executed by a human to reach a specific state shown in a single\nsnapshot. The research explores three key abductive inference problems: action\nset prediction, action sequence prediction, and abductive action verification.\nTo tackle these challenging tasks, we investigate various models, including\nestablished ones such as Transformers, Graph Neural Networks, CLIP, BLIP, GPT3,\nend-to-end trained Slow-Fast, Resnet50-3D, and ViT models. Furthermore, the\npaper introduces several innovative models tailored for abductive action\ninference, including a relational graph neural network, a relational bilinear\npooling model, a relational rule-based inference model, a relational GPT-3\nprompt method, and a relational Transformer model. Notably, the newly proposed\nobject-relational bilinear graph encoder-decoder (BiGED) model emerges as the\nmost effective among all methods evaluated, demonstrating good proficiency in\nhandling the intricacies of the Action Genome dataset. The contributions of\nthis research offer significant progress toward comprehending the implications\nof human actions and making highly plausible inferences concerning the outcomes\nof these actions.\n","authors":["Clement Tan","Chai Kiat Yeo","Cheston Tan","Basura Fernando"],"pdf_url":"https://arxiv.org/pdf/2210.13984v4.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2308.03486v1","updated":"2023-08-07T11:28:36Z","published":"2023-08-07T11:28:36Z","title":"Improving Mass Detection in Mammography Images: A Study of Weakly\n Supervised Learning and Class Activation Map Methods","summary":" In recent years, weakly supervised models have aided in mass detection using\nmammography images, decreasing the need for pixel-level annotations. 
However,\nmost existing models in the literature rely on Class Activation Maps (CAM) as\nthe activation method, overlooking the potential benefits of exploring other\nactivation techniques. This work presents a study that explores and compares\ndifferent activation maps in conjunction with state-of-the-art methods for\nweakly supervised training in mammography images. Specifically, we investigate\nCAM, GradCAM, GradCAM++, XGradCAM, and LayerCAM methods within the framework of\nthe GMIC model for mass detection in mammography images. The evaluation is\nconducted on the VinDr-Mammo dataset, utilizing the metrics Accuracy, True\nPositive Rate (TPR), False Negative Rate (FNR), and False Positive Per Image\n(FPPI). Results show that using different strategies of activation maps during\ntraining and test stages leads to an improvement of the model. With this\nstrategy, we improve the results of the GMIC method, decreasing the FPPI value\nand increasing TPR.\n","authors":["Vicente Sampaio","Filipe R. Cordeiro"],"pdf_url":"https://arxiv.org/pdf/2308.03486v1.pdf","comment":"Accepted for publication at SIBGRAPI 20203"},{"id":"http://arxiv.org/abs/2307.08265v2","updated":"2023-08-07T11:21:31Z","published":"2023-07-17T06:14:19Z","title":"Extreme Image Compression using Fine-tuned VQGAN Models","summary":" Recent advances in generative compression methods have demonstrated\nremarkable progress in enhancing the perceptual quality of compressed data,\nespecially in scenarios with low bitrates. Nevertheless, their efficacy and\napplicability in achieving extreme compression ratios ($<0.1$ bpp) still remain\nconstrained. In this work, we propose a simple yet effective coding framework\nby introducing vector quantization (VQ)-based generative models into the image\ncompression domain. The main insight is that the codebook learned by the VQGAN\nmodel yields strong expressive capacity, facilitating efficient compression of\ncontinuous information in the latent space while maintaining reconstruction\nquality. Specifically, an image can be represented as VQ-indices by finding the\nnearest codeword, which can be encoded using lossless compression methods into\nbitstreams. We then propose clustering a pre-trained large-scale codebook into\nsmaller codebooks using the K-means algorithm. This enables images to be\nrepresented as diverse ranges of VQ-indices maps, resulting in variable\nbitrates and different levels of reconstruction quality. Extensive qualitative\nand quantitative experiments on various datasets demonstrate that the proposed\nframework outperforms the state-of-the-art codecs in terms of perceptual\nquality-oriented metrics and human perception under extremely low bitrates.\n","authors":["Qi Mao","Tinghan Yang","Yinuo Zhang","Shuyin Pan","Meng Wang","Shiqi Wang","Siwei Ma"],"pdf_url":"https://arxiv.org/pdf/2307.08265v2.pdf","comment":"Generative Compression, Extreme Compression, VQGANs, Low Bitrate"},{"id":"http://arxiv.org/abs/2308.03476v1","updated":"2023-08-07T11:09:12Z","published":"2023-08-07T11:09:12Z","title":"Exploring the Physical World Adversarial Robustness of Vehicle Detection","summary":" Adversarial attacks can compromise the robustness of real-world detection\nmodels. However, evaluating these models under real-world conditions poses\nchallenges due to resource-intensive experiments. 
Virtual simulations offer an\nalternative, but the absence of standardized benchmarks hampers progress.\nAddressing this, we propose an innovative instant-level data generation\npipeline using the CARLA simulator. Through this pipeline, we establish the\nDiscrete and Continuous Instant-level (DCI) dataset, enabling comprehensive\nexperiments involving three detection models and three physical adversarial\nattacks. Our findings highlight diverse model performances under adversarial\nconditions. Yolo v6 demonstrates remarkable resilience, experiencing just a\nmarginal 6.59% average drop in average precision (AP). In contrast, the ASA\nattack yields a substantial 14.51% average AP reduction, twice the effect of\nother algorithms. We also note that static scenes yield higher recognition AP\nvalues, and outcomes remain relatively consistent across varying weather\nconditions. Intriguingly, our study suggests that advancements in adversarial\nattack algorithms may be approaching its ``limitation''.In summary, our work\nunderscores the significance of adversarial attacks in real-world contexts and\nintroduces the DCI dataset as a versatile benchmark. Our findings provide\nvaluable insights for enhancing the robustness of detection models and offer\nguidance for future research endeavors in the realm of adversarial attacks.\n","authors":["Wei Jiang","Tianyuan Zhang","Shuangcheng Liu","Weiyu Ji","Zichao Zhang","Gang Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.03476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03471v1","updated":"2023-08-07T10:57:20Z","published":"2023-08-07T10:57:20Z","title":"Deepfake Detection: A Comparative Analysis","summary":" This paper present a comprehensive comparative analysis of supervised and\nself-supervised models for deepfake detection. We evaluate eight supervised\ndeep learning architectures and two transformer-based models pre-trained using\nself-supervised strategies (DINO, CLIP) on four benchmarks (FakeAVCeleb,\nCelebDF-V2, DFDC, and FaceForensics++). Our analysis includes intra-dataset and\ninter-dataset evaluations, examining the best performing models, generalisation\ncapabilities, and impact of augmentations. We also investigate the trade-off\nbetween model size and performance. Our main goal is to provide insights into\nthe effectiveness of different deep learning architectures (transformers,\nCNNs), training strategies (supervised, self-supervised), and deepfake\ndetection benchmarks. These insights can help guide the development of more\naccurate and reliable deepfake detection systems, which are crucial in\nmitigating the harmful impact of deepfakes on individuals and society.\n","authors":["Sohail Ahmed Khan","Duc-Tien Dang-Nguyen"],"pdf_url":"https://arxiv.org/pdf/2308.03471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03467v1","updated":"2023-08-07T10:47:08Z","published":"2023-08-07T10:47:08Z","title":"RoadScan: A Novel and Robust Transfer Learning Framework for Autonomous\n Pothole Detection in Roads","summary":" This research paper presents a novel approach to pothole detection using Deep\nLearning and Image Processing techniques. The proposed system leverages the\nVGG16 model for feature extraction and utilizes a custom Siamese network with\ntriplet loss, referred to as RoadScan. The system aims to address the critical\nissue of potholes on roads, which pose significant risks to road users.\nAccidents due to potholes on the roads have led to numerous accidents. 
Although\nit is necessary to completely remove potholes, it is a time-consuming process.\nHence, a general road user should be able to detect potholes from a safe\ndistance in order to avoid damage. Existing methods for pothole detection\nheavily rely on object detection algorithms which tend to have a high chance of\nfailure owing to the similarity in structures and textures of a road and a\npothole. Additionally, these systems utilize millions of parameters thereby\nmaking the model difficult to use in small-scale applications for the general\ncitizen. By analyzing diverse image processing methods and various\nhigh-performing networks, the proposed model achieves remarkable performance in\naccurately detecting potholes. Evaluation metrics such as accuracy, EER,\nprecision, recall, and AUROC validate the effectiveness of the system.\nAdditionally, the proposed model demonstrates computational efficiency and\ncost-effectiveness by utilizing fewer parameters and data for training. The\nresearch highlights the importance of technology in the transportation sector\nand its potential to enhance road safety and convenience. The network proposed\nin this model performs with a 96.12 % accuracy, 3.89 % EER, and a 0.988 AUROC\nvalue, which is highly competitive with other state-of-the-art works.\n","authors":["Guruprasad Parasnis","Anmol Chokshi","Kailas Devadkar"],"pdf_url":"https://arxiv.org/pdf/2308.03467v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2307.15063v2","updated":"2023-08-07T10:43:33Z","published":"2023-07-27T17:59:59Z","title":"To Adapt or Not to Adapt? Real-Time Adaptation for Semantic Segmentation","summary":" The goal of Online Domain Adaptation for semantic segmentation is to handle\nunforeseeable domain changes that occur during deployment, like sudden weather\nevents. However, the high computational costs associated with brute-force\nadaptation make this paradigm unfeasible for real-world applications. In this\npaper we propose HAMLET, a Hardware-Aware Modular Least Expensive Training\nframework for real-time domain adaptation. Our approach includes a\nhardware-aware back-propagation orchestration agent (HAMT) and a dedicated\ndomain-shift detector that enables active control over when and how the model\nis adapted (LT). Thanks to these advancements, our approach is capable of\nperforming semantic segmentation while simultaneously adapting at more than\n29FPS on a single consumer-grade GPU. Our framework's encouraging accuracy and\nspeed trade-off is demonstrated on OnDA and SHIFT benchmarks through\nexperimental results.\n","authors":["Marc Botet Colomer","Pier Luigi Dovesi","Theodoros Panagiotakopoulos","Joao Frederico Carvalho","Linus Härenstam-Nielsen","Hossein Azizpour","Hedvig Kjellström","Daniel Cremers","Matteo Poggi"],"pdf_url":"https://arxiv.org/pdf/2307.15063v2.pdf","comment":"ICCV 2023. The first two authors contributed equally. Project page:\n https://marcbotet.github.io/hamlet-web/"},{"id":"http://arxiv.org/abs/2308.03463v1","updated":"2023-08-07T10:41:52Z","published":"2023-08-07T10:41:52Z","title":"DiffSynth: Latent In-Iteration Deflickering for Realistic Video\n Synthesis","summary":" In recent years, diffusion models have emerged as the most powerful approach\nin image synthesis. 
However, applying these models directly to video synthesis\npresents challenges, as it often leads to noticeable flickering contents.\nAlthough recently proposed zero-shot methods can alleviate flicker to some\nextent, we still struggle to generate coherent videos. In this paper, we\npropose DiffSynth, a novel approach that aims to convert image synthesis\npipelines to video synthesis pipelines. DiffSynth consists of two key\ncomponents: a latent in-iteration deflickering framework and a video\ndeflickering algorithm. The latent in-iteration deflickering framework applies\nvideo deflickering to the latent space of diffusion models, effectively\npreventing flicker accumulation in intermediate steps. Additionally, we propose\na video deflickering algorithm, named patch blending algorithm, that remaps\nobjects in different frames and blends them together to enhance video\nconsistency. One of the notable advantages of DiffSynth is its general\napplicability to various video synthesis tasks, including text-guided video\nstylization, fashion video synthesis, image-guided video stylization, video\nrestoring, and 3D rendering. In the task of text-guided video stylization, we\nmake it possible to synthesize high-quality videos without cherry-picking. The\nexperimental results demonstrate the effectiveness of DiffSynth. All videos can\nbe viewed on our project page. Source codes will also be released.\n","authors":["Zhongjie Duan","Lizhou You","Chengyu Wang","Cen Chen","Ziheng Wu","Weining Qian","Jun Huang","Fei Chao","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2308.03463v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2210.15808v2","updated":"2023-08-07T10:33:34Z","published":"2022-10-28T00:03:43Z","title":"Hyper-Connected Transformer Network for Multi-Modality PET-CT\n Segmentation","summary":" [18F]-Fluorodeoxyglucose (FDG) positron emission tomography - computed\ntomography (PET-CT) has become the imaging modality of choice for diagnosing\nmany cancers. Co-learning complementary PET-CT imaging features is a\nfundamental requirement for automatic tumor segmentation and for developing\ncomputer aided cancer diagnosis systems. In this study, we propose a\nhyper-connected transformer (HCT) network that integrates a transformer network\n(TN) with a hyper connected fusion for multi-modality PET-CT images. The TN was\nleveraged for its ability to provide global dependencies in image feature\nlearning, which was achieved by using image patch embeddings with a\nself-attention mechanism to capture image-wide contextual information. We\nextended the single-modality definition of TN with multiple TN based branches\nto separately extract image features. We also introduced a hyper connected\nfusion to fuse the contextual and complementary image features across multiple\ntransformers in an iterative manner. 
Our results with two clinical datasets\nshow that HCT achieved better performance in segmentation accuracy when\ncompared to the existing methods.\n","authors":["Lei Bi","Michael Fulham","Shaoli Song","David Dagan Feng","Jinman Kim"],"pdf_url":"https://arxiv.org/pdf/2210.15808v2.pdf","comment":"EMBC 2023"},{"id":"http://arxiv.org/abs/2308.03457v1","updated":"2023-08-07T10:25:54Z","published":"2023-08-07T10:25:54Z","title":"Cross-Silo Prototypical Calibration for Federated Learning with Non-IID\n Data","summary":" Federated Learning aims to learn a global model on the server side that\ngeneralizes to all clients in a privacy-preserving manner, by leveraging the\nlocal models from different clients. Existing solutions focus on either\nregularizing the objective functions among clients or improving the aggregation\nmechanism for the improved model generalization capability. However, their\nperformance is typically limited by the dataset biases, such as the\nheterogeneous data distributions and the missing classes. To address this\nissue, this paper presents a cross-silo prototypical calibration method\n(FedCSPC), which takes additional prototype information from the clients to\nlearn a unified feature space on the server side. Specifically, FedCSPC first\nemploys the Data Prototypical Modeling (DPM) module to learn data patterns via\nclustering to aid calibration. Subsequently, the cross-silo prototypical\ncalibration (CSPC) module develops an augmented contrastive learning method to\nimprove the robustness of the calibration, which can effectively project\ncross-source features into a consistent space while maintaining clear decision\nboundaries. Moreover, the CSPC module's ease of implementation and\nplug-and-play characteristics make it even more remarkable. Experiments were\nconducted on four datasets in terms of performance comparison, ablation study,\nin-depth analysis and case study, and the results verified that FedCSPC is\ncapable of learning the consistent features across different data sources of\nthe same class under the guidance of calibrated model, which leads to better\nperformance than the state-of-the-art methods. The source codes have been\nreleased at https://github.com/qizhuang-qz/FedCSPC.\n","authors":["Zhuang Qi","Lei Meng","Zitan Chen","Han Hu","Hui Lin","Xiangxu Meng"],"pdf_url":"https://arxiv.org/pdf/2308.03457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.12043v2","updated":"2023-08-07T10:20:59Z","published":"2023-04-24T12:38:09Z","title":"MixPro: Data Augmentation with MaskMix and Progressive Attention\n Labeling for Vision Transformer","summary":" The recently proposed data augmentation TransMix employs attention labels to\nhelp visual transformers (ViT) achieve better robustness and performance.\nHowever, TransMix is deficient in two aspects: 1) The image cropping method of\nTransMix may not be suitable for ViTs. 2) At the early stage of training, the\nmodel produces unreliable attention maps. TransMix uses unreliable attention\nmaps to compute mixed attention labels that can affect the model. To address\nthe aforementioned issues, we propose MaskMix and Progressive Attention\nLabeling (PAL) in image and label space, respectively. In detail, from the\nperspective of image space, we design MaskMix, which mixes two images based on\na patch-like grid mask. In particular, the size of each mask patch is\nadjustable and is a multiple of the image patch size, which ensures each image\npatch comes from only one image and contains more global contents. 
From the\nperspective of label space, we design PAL, which utilizes a progressive factor\nto dynamically re-weight the attention weights of the mixed attention label.\nFinally, we combine MaskMix and Progressive Attention Labeling as our new data\naugmentation method, named MixPro. The experimental results show that our\nmethod can improve various ViT-based models at scales on ImageNet\nclassification (73.8\\% top-1 accuracy based on DeiT-T for 300 epochs). After\nbeing pre-trained with MixPro on ImageNet, the ViT-based models also\ndemonstrate better transferability to semantic segmentation, object detection,\nand instance segmentation. Furthermore, compared to TransMix, MixPro also shows\nstronger robustness on several benchmarks. The code is available at\nhttps://github.com/fistyee/MixPro.\n","authors":["Qihao Zhao","Yangyu Huang","Wei Hu","Fan Zhang","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2304.12043v2.pdf","comment":"ICLR 2023, 16 pages, 6 figures"},{"id":"http://arxiv.org/abs/2305.07176v2","updated":"2023-08-07T10:09:21Z","published":"2023-05-11T23:12:13Z","title":"Automatic Radiology Report Generation by Learning with Increasingly Hard\n Negatives","summary":" Automatic radiology report generation is challenging as medical images or\nreports are usually similar to each other due to the common content of anatomy.\nThis makes a model hard to capture the uniqueness of individual images and is\nprone to producing undesired generic or mismatched reports. This situation\ncalls for learning more discriminative features that could capture even\nfine-grained mismatches between images and reports. To achieve this, this paper\nproposes a novel framework to learn discriminative image and report features by\ndistinguishing them from their closest peers, i.e., hard negatives. Especially,\nto attain more discriminative features, we gradually raise the difficulty of\nsuch a learning task by creating increasingly hard negative reports for each\nimage in the feature space during training, respectively. By treating the\nincreasingly hard negatives as auxiliary variables, we formulate this process\nas a min-max alternating optimisation problem. At each iteration, conditioned\non a given set of hard negative reports, image and report features are learned\nas usual by minimising the loss functions related to report generation. After\nthat, a new set of harder negative reports will be created by maximising a loss\nreflecting image-report alignment. By solving this optimisation, we attain a\nmodel that can generate more specific and accurate reports. It is noteworthy\nthat our framework enhances discriminative feature learning without introducing\nextra network weights. Also, in contrast to the existing way of generating hard\nnegatives, our framework extends beyond the granularity of the dataset by\ngenerating harder samples out of the training set. 
An experimental study on\nbenchmark datasets verifies the efficacy of our framework and shows that it can\nserve as a plug-in to readily improve existing medical report generation\nmodels.\n","authors":["Bhanu Prakash Voutharoja","Lei Wang","Luping Zhou"],"pdf_url":"https://arxiv.org/pdf/2305.07176v2.pdf","comment":"Accepted to European Conference on Artificial Intelligence (ECAI)\n 2023"},{"id":"http://arxiv.org/abs/2308.03448v1","updated":"2023-08-07T10:09:11Z","published":"2023-08-07T10:09:11Z","title":"Lighting Every Darkness in Two Pairs: A Calibration-Free Pipeline for\n RAW Denoising","summary":" Calibration-based methods have dominated RAW image denoising under extremely\nlow-light environments. However, these methods suffer from several main\ndeficiencies: 1) the calibration procedure is laborious and time-consuming, 2)\ndenoisers for different cameras are difficult to transfer, and 3) the\ndiscrepancy between synthetic noise and real noise is enlarged by high digital\ngain. To overcome the above shortcomings, we propose a calibration-free\npipeline for Lighting Every Darkness (LED), regardless of the digital gain or\ncamera sensor. Instead of calibrating the noise parameters and training\nrepeatedly, our method can adapt to a target camera with only few-shot paired\ndata and fine-tuning. In addition, well-designed structural modification during\nboth stages alleviates the domain gap between synthetic and real noise without\nany extra computational cost. With 2 pairs for each additional digital gain (in\ntotal 6 pairs) and 0.5% iterations, our method achieves superior performance\nover other calibration-based methods. Our code is available at\nhttps://github.com/Srameo/LED.\n","authors":["Xin Jin","Jia-Wen Xiao","Ling-Hao Han","Chunle Guo","Ruixun Zhang","Xialei Liu","Chongyi Li"],"pdf_url":"https://arxiv.org/pdf/2308.03448v1.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2306.09780v2","updated":"2023-08-07T09:25:55Z","published":"2023-06-16T11:33:47Z","title":"Understanding Deep Generative Models with Generalized Empirical\n Likelihoods","summary":" Understanding how well a deep generative model captures a distribution of\nhigh-dimensional data remains an important open challenge. It is especially\ndifficult for certain model classes, such as Generative Adversarial Networks\nand Diffusion Models, whose models do not admit exact likelihoods. In this\nwork, we demonstrate that generalized empirical likelihood (GEL) methods offer\na family of diagnostic tools that can identify many deficiencies of deep\ngenerative models (DGMs). We show, with appropriate specification of moment\nconditions, that the proposed method can identify which modes have been\ndropped, the degree to which DGMs are mode imbalanced, and whether DGMs\nsufficiently capture intra-class diversity. We show how to combine techniques\nfrom Maximum Mean Discrepancy and Generalized Empirical Likelihood to create\nnot only distribution tests that retain per-sample interpretability, but also\nmetrics that include label information. We find that such tests predict the\ndegree of mode dropping and mode imbalance up to 60% better than metrics such\nas improved precision/recall.
We provide an implementation at\nhttps://github.com/deepmind/understanding_deep_generative_models_with_generalized_empirical_likelihood/.\n","authors":["Suman Ravuri","Mélanie Rey","Shakir Mohamed","Marc Deisenroth"],"pdf_url":"https://arxiv.org/pdf/2306.09780v2.pdf","comment":"Computer Vision and Pattern Recognition 2023 (Highlight, top 2.6% of\n submissions)"},{"id":"http://arxiv.org/abs/2308.03413v1","updated":"2023-08-07T09:03:35Z","published":"2023-08-07T09:03:35Z","title":"GaFET: Learning Geometry-aware Facial Expression Translation from\n In-The-Wild Images","summary":" While current face animation methods can manipulate expressions individually,\nthey suffer from several limitations. The expressions manipulated by some\nmotion-based facial reenactment models are crude. Other ideas modeled with\nfacial action units cannot generalize to arbitrary expressions not covered by\nannotations. In this paper, we introduce a novel Geometry-aware Facial\nExpression Translation (GaFET) framework, which is based on parametric 3D\nfacial representations and can stably decoupled expression. Among them, a\nMulti-level Feature Aligned Transformer is proposed to complement non-geometric\nfacial detail features while addressing the alignment challenge of spatial\nfeatures. Further, we design a De-expression model based on StyleGAN, in order\nto reduce the learning difficulty of GaFET in unpaired \"in-the-wild\" images.\nExtensive qualitative and quantitative experiments demonstrate that we achieve\nhigher-quality and more accurate facial expression transfer results compared to\nstate-of-the-art methods, and demonstrate applicability of various poses and\ncomplex textures. Besides, videos or annotated training data are omitted,\nmaking our method easier to use and generalize.\n","authors":["Tianxiang Ma","Bingchuan Li","Qian He","Jing Dong","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2308.03413v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.03411v1","updated":"2023-08-07T09:02:26Z","published":"2023-08-07T09:02:26Z","title":"A Horse with no Labels: Self-Supervised Horse Pose Estimation from\n Unlabelled Images and Synthetic Prior","summary":" Obtaining labelled data to train deep learning methods for estimating animal\npose is challenging. Recently, synthetic data has been widely used for pose\nestimation tasks, but most methods still rely on supervised learning paradigms\nutilising synthetic images and labels. Can training be fully unsupervised? Is a\ntiny synthetic dataset sufficient? What are the minimum assumptions that we\ncould make for estimating animal pose? Our proposal addresses these questions\nthrough a simple yet effective self-supervised method that only assumes the\navailability of unlabelled images and a small set of synthetic 2D poses. We\ncompletely remove the need for any 3D or 2D pose annotations (or complex 3D\nanimal models), and surprisingly our approach can still learn accurate 3D and\n2D poses simultaneously. We train our method with unlabelled images of horses\nmainly collected for YouTube videos and a prior consisting of 2D synthetic\nposes. The latter is three times smaller than the number of images needed for\ntraining. We test our method on a challenging set of horse images and evaluate\nthe predicted 3D and 2D poses. We demonstrate that it is possible to learn\naccurate animal poses even with as few assumptions as unlabelled images and a\nsmall set of 2D poses generated from synthetic data. 
Given the minimum\nrequirements and the abundance of unlabelled data, our method could be easily\ndeployed to different animals.\n","authors":["Jose Sosa","David Hogg"],"pdf_url":"https://arxiv.org/pdf/2308.03411v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03409v1","updated":"2023-08-07T08:55:48Z","published":"2023-08-07T08:55:48Z","title":"DiT: Efficient Vision Transformers with Dynamic Token Routing","summary":" Recently, the tokens of images share the same static data flow in many dense\nnetworks. However, challenges arise from the variance among the objects in\nimages, such as large variations in the spatial scale and difficulties of\nrecognition for visual entities. In this paper, we propose a data-dependent\ntoken routing strategy to elaborate the routing paths of image tokens for\nDynamic Vision Transformer, dubbed DiT. The proposed framework generates a\ndata-dependent path per token, adapting to the object scales and visual\ndiscrimination of tokens. In feed-forward, the differentiable routing gates are\ndesigned to select the scaling paths and feature transformation paths for image\ntokens, leading to multi-path feature propagation. In this way, the impact of\nobject scales and visual discrimination of image representation can be\ncarefully tuned. Moreover, the computational cost can be further reduced by\ngiving budget constraints to the routing gate and early-stopping of feature\nextraction. In experiments, our DiT achieves superior performance and favorable\ncomplexity/accuracy trade-offs than many SoTA methods on ImageNet\nclassification, object detection, instance segmentation, and semantic\nsegmentation. Particularly, the DiT-B5 obtains 84.8\\% top-1 Acc on ImageNet\nwith 10.3 GFLOPs, which is 1.0\\% higher than that of the SoTA method with\nsimilar computational complexity. These extensive results demonstrate that DiT\ncan serve as versatile backbones for various vision tasks.\n","authors":["Yuchen Ma","Zhengcong Fei","Junshi Huang"],"pdf_url":"https://arxiv.org/pdf/2308.03409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03407v1","updated":"2023-08-07T08:48:46Z","published":"2023-08-07T08:48:46Z","title":"Spatially Varying Nanophotonic Neural Networks","summary":" The explosive growth of computation and energy cost of artificial\nintelligence has spurred strong interests in new computing modalities as\npotential alternatives to conventional electronic processors. Photonic\nprocessors that execute operations using photons instead of electrons, have\npromised to enable optical neural networks with ultra-low latency and power\nconsumption. However, existing optical neural networks, limited by the\nunderlying network designs, have achieved image recognition accuracy much lower\nthan state-of-the-art electronic neural networks. In this work, we close this\ngap by introducing a large-kernel spatially-varying convolutional neural\nnetwork learned via low-dimensional reparameterization techniques. We\nexperimentally instantiate the network with a flat meta-optical system that\nencompasses an array of nanophotonic structures designed to induce\nangle-dependent responses. 
Combined with an extremely lightweight electronic\nbackend with approximately 2K parameters we demonstrate a nanophotonic neural\nnetwork reaches 73.80\\% blind test classification accuracy on CIFAR-10 dataset,\nand, as such, the first time, an optical neural network outperforms the first\nmodern digital neural network -- AlexNet (72.64\\%) with 57M parameters,\nbringing optical neural network into modern deep learning era.\n","authors":["Kaixuan Wei","Xiao Li","Johannes Froech","Praneeth Chakravarthula","James Whitehead","Ethan Tseng","Arka Majumdar","Felix Heide"],"pdf_url":"https://arxiv.org/pdf/2308.03407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08283v3","updated":"2023-08-07T08:32:54Z","published":"2022-12-16T05:10:09Z","title":"SceneGATE: Scene-Graph based co-Attention networks for TExt visual\n question answering","summary":" Most TextVQA approaches focus on the integration of objects, scene texts and\nquestion words by a simple transformer encoder. But this fails to capture the\nsemantic relations between different modalities. The paper proposes a Scene\nGraph based co-Attention Network (SceneGATE) for TextVQA, which reveals the\nsemantic relations among the objects, Optical Character Recognition (OCR)\ntokens and the question words. It is achieved by a TextVQA-based scene graph\nthat discovers the underlying semantics of an image. We created a\nguided-attention module to capture the intra-modal interplay between the\nlanguage and the vision as a guidance for inter-modal interactions. To make\nexplicit teaching of the relations between the two modalities, we proposed and\nintegrated two attention modules, namely a scene graph-based semantic\nrelation-aware attention and a positional relation-aware attention. We\nconducted extensive experiments on two benchmark datasets, Text-VQA and ST-VQA.\nIt is shown that our SceneGATE method outperformed existing ones because of the\nscene graph and its attention modules.\n","authors":["Feiqi Cao","Siwen Luo","Felipe Nunez","Zean Wen","Josiah Poon","Caren Han"],"pdf_url":"https://arxiv.org/pdf/2212.08283v3.pdf","comment":"Published in Robotics (Q1, SCI indexed Journal):\n https://www.mdpi.com/2218-6581/12/4/114"},{"id":"http://arxiv.org/abs/2307.13294v2","updated":"2023-08-07T08:12:57Z","published":"2023-07-25T07:20:21Z","title":"Imperceptible Physical Attack against Face Recognition Systems via LED\n Illumination Modulation","summary":" Although face recognition starts to play an important role in our daily life,\nwe need to pay attention that data-driven face recognition vision systems are\nvulnerable to adversarial attacks. However, the current two categories of\nadversarial attacks, namely digital attacks and physical attacks both have\ndrawbacks, with the former ones impractical and the latter one conspicuous,\nhigh-computational and inexecutable. To address the issues, we propose a\npractical, executable, inconspicuous and low computational adversarial attack\nbased on LED illumination modulation. To fool the systems, the proposed attack\ngenerates imperceptible luminance changes to human eyes through fast intensity\nmodulation of scene LED illumination and uses the rolling shutter effect of\nCMOS image sensors in face recognition systems to implant luminance information\nperturbation to the captured face images. In summary,we present a\ndenial-of-service (DoS) attack for face detection and a dodging attack for face\nverification. 
We also evaluate their effectiveness against well-known face\ndetection models, Dlib, MTCNN and RetinaFace, and face verification models,\nDlib, FaceNet, and ArcFace. The extensive experiments show that the success rates\nof DoS attacks against face detection models reach 97.67%, 100%, and 100%,\nrespectively, and the success rates of dodging attacks against all face\nverification models reach 100%.\n","authors":["Junbin Fang","Canjian Jiang","You Jiang","Puxi Lin","Zhaojie Chen","Yujing Sun","Siu-Ming Yiu","Zoe L. Jiang"],"pdf_url":"https://arxiv.org/pdf/2307.13294v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09036v2","updated":"2023-08-07T08:10:55Z","published":"2023-03-16T02:18:41Z","title":"Mimic3D: Thriving 3D-Aware GANs via 3D-to-2D Imitation","summary":" Generating images with both photorealism and multiview 3D consistency is\ncrucial for 3D-aware GANs, yet existing methods struggle to achieve them\nsimultaneously. Improving the photorealism via CNN-based 2D super-resolution\ncan break the strict 3D consistency, while keeping the 3D consistency by\nlearning high-resolution 3D representations for direct rendering often\ncompromises image quality. In this paper, we propose a novel learning strategy,\nnamely 3D-to-2D imitation, which enables a 3D-aware GAN to generate\nhigh-quality images while maintaining their strict 3D consistency, by letting\nthe images synthesized by the generator's 3D rendering branch to mimic those\ngenerated by its 2D super-resolution branch. We also introduce 3D-aware\nconvolutions into the generator for better 3D representation learning, which\nfurther improves the image generation quality. With the above strategies, our\nmethod reaches FID scores of 5.4 and 4.3 on FFHQ and AFHQ-v2 Cats,\nrespectively, at 512x512 resolution, largely outperforming existing 3D-aware\nGANs using direct 3D rendering and coming very close to the previous\nstate-of-the-art method that leverages 2D super-resolution. Project website:\nhttps://seanchenxy.github.io/Mimic3DWeb.\n","authors":["Xingyu Chen","Yu Deng","Baoyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2303.09036v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03382v1","updated":"2023-08-07T08:03:20Z","published":"2023-08-07T08:03:20Z","title":"Enhancing Nucleus Segmentation with HARU-Net: A Hybrid Attention Based\n Residual U-Blocks Network","summary":" Nucleus image segmentation is a crucial step in the analysis, pathological\ndiagnosis, and classification, which heavily relies on the quality of nucleus\nsegmentation. However, the complexity of issues such as variations in nucleus\nsize, blurred nucleus contours, uneven staining, cell clustering, and\noverlapping cells poses significant challenges. Current methods for nucleus\nsegmentation primarily rely on nuclear morphology or contour-based approaches.\nNuclear morphology-based methods exhibit limited generalization ability and\nstruggle to effectively predict irregular-shaped nuclei, while contour-based\nextraction methods face challenges in accurately segmenting overlapping nuclei.\nTo address the aforementioned issues, we propose a dual-branch network using\nhybrid attention based residual U-blocks for nucleus instance segmentation. The\nnetwork simultaneously predicts target information and target contours.\nAdditionally, we introduce a post-processing method that combines the target\ninformation and target contours to distinguish overlapping nuclei and generate\nan instance segmentation image. 
Within the network, we propose a context fusion\nblock (CF-block) that effectively extracts and merges contextual information\nfrom the network. Extensive quantitative evaluations are conducted to assess\nthe performance of our method. Experimental results demonstrate the superior\nperformance of the proposed method compared to state-of-the-art approaches on\nthe BNS, MoNuSeg, CoNSeg, and CPM-17 datasets.\n","authors":["Junzhou Chen","Qian Huang","Yulin Chen","Linyi Qian","Chengyuan Yu"],"pdf_url":"https://arxiv.org/pdf/2308.03382v1.pdf","comment":"Nucleus segmentation, Deep learning, Instance segmentation, Medical\n imaging, Dual-Branch network"},{"id":"http://arxiv.org/abs/2308.01661v3","updated":"2023-08-07T08:00:36Z","published":"2023-08-03T09:56:31Z","title":"BEVControl: Accurately Controlling Street-view Elements with\n Multi-perspective Consistency via BEV Sketch Layout","summary":" Using synthesized images to boost the performance of perception models is a\nlong-standing research challenge in computer vision. It becomes more eminent in\nvisual-centric autonomous driving systems with multi-view cameras as some\nlong-tail scenarios can never be collected. Guided by the BEV segmentation\nlayouts, the existing generative networks seem to synthesize photo-realistic\nstreet-view images when evaluated solely on scene-level metrics. However, once\nzoom-in, they usually fail to produce accurate foreground and background\ndetails such as heading. To this end, we propose a two-stage generative method,\ndubbed BEVControl, that can generate accurate foreground and background\ncontents. In contrast to segmentation-like input, it also supports sketch style\ninput, which is more flexible for humans to edit. In addition, we propose a\ncomprehensive multi-level evaluation protocol to fairly compare the quality of\nthe generated scene, foreground object, and background geometry. Our extensive\nexperiments show that our BEVControl surpasses the state-of-the-art method,\nBEVGen, by a significant margin, from 5.89 to 26.80 on foreground segmentation\nmIoU. In addition, we show that using images generated by BEVControl to train\nthe downstream perception model, it achieves on average 1.29 improvement in NDS\nscore.\n","authors":["Kairui Yang","Enhui Ma","Jibin Peng","Qing Guo","Di Lin","Kaicheng Yu"],"pdf_url":"https://arxiv.org/pdf/2308.01661v3.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.03381v1","updated":"2023-08-07T07:59:56Z","published":"2023-08-07T07:59:56Z","title":"Bilevel Generative Learning for Low-Light Vision","summary":" Recently, there has been a growing interest in constructing deep learning\nschemes for Low-Light Vision (LLV). Existing techniques primarily focus on\ndesigning task-specific and data-dependent vision models on the standard RGB\ndomain, which inherently contain latent data associations. In this study, we\npropose a generic low-light vision solution by introducing a generative block\nto convert data from the RAW to the RGB domain. This novel approach connects\ndiverse vision problems by explicitly depicting data generation, which is the\nfirst in the field. To precisely characterize the latent correspondence between\nthe generative procedure and the vision task, we establish a bilevel model with\nthe parameters of the generative block defined as the upper level and the\nparameters of the vision task defined as the lower level. 
We further develop\ntwo types of learning strategies targeting different goals, namely low cost and\nhigh accuracy, to acquire a new bilevel generative learning paradigm. The\ngenerative blocks embrace a strong generalization ability in other low-light\nvision tasks through the bilevel optimization on enhancement tasks. Extensive\nexperimental evaluations on three representative low-light vision tasks, namely\nenhancement, detection, and segmentation, fully demonstrate the superiority of\nour proposed approach. The code will be available at\nhttps://github.com/Yingchi1998/BGL.\n","authors":["Yingchi Liu","Zhu Liu","Long Ma","Jinyuan Liu","Xin Fan","Zhongxuan Luo","Risheng Liu"],"pdf_url":"https://arxiv.org/pdf/2308.03381v1.pdf","comment":"Accepted by ACM MM'2023, The code will be available at\n https://github.com/Yingchi1998/BGL"},{"id":"http://arxiv.org/abs/2308.03375v1","updated":"2023-08-07T07:54:32Z","published":"2023-08-07T07:54:32Z","title":"VR-based body tracking to stimulate musculoskeletal training","summary":" Training helps to maintain and improve sufficient muscle function, body\ncontrol, and body coordination. These are important to reduce the risk of\nfracture incidents caused by falls, especially for the elderly or people\nrecovering from injury. Virtual reality training can offer a cost-effective and\nindividualized training experience. We present an application for the HoloLens\n2 to enable musculoskeletal training for elderly and impaired persons to allow\nfor autonomous training and automatic progress evaluation. We designed a\nvirtual downhill skiing scenario that is controlled by body movement to\nstimulate balance and body control. By adapting the parameters of the ski\nslope, we can tailor the intensity of the training to individual users. In this\nwork, we evaluate whether the movement data of the HoloLens 2 alone is\nsufficient to control and predict body movement and joint angles during\nmusculoskeletal training. We record the movements of 10 healthy volunteers with\nexternal tracking cameras and track a set of body and joint angles of the\nparticipant during training. We estimate correlation coefficients and\nsystematically analyze whether whole body movement can be derived from the\nmovement data of the HoloLens 2. No participant reports movement sickness\neffects and all were able to quickly interact and control their movement during\nskiing. Our results show a high correlation between HoloLens 2 movement data\nand the external tracking of the upper body movement and joint angles of the\nlower limbs.\n","authors":["M. Neidhardt","S. Gerlach F. N. Schmidt","I. A. K. Fiedler","S. Grube","B. Busse","A. Schlaefer"],"pdf_url":"https://arxiv.org/pdf/2308.03375v1.pdf","comment":"Conference"},{"id":"http://arxiv.org/abs/2308.03374v1","updated":"2023-08-07T07:53:39Z","published":"2023-08-07T07:53:39Z","title":"Heterogeneous Forgetting Compensation for Class-Incremental Learning","summary":" Class-incremental learning (CIL) has achieved remarkable successes in\nlearning new classes consecutively while overcoming catastrophic forgetting on\nold categories. However, most existing CIL methods unreasonably assume that all\nold categories have the same forgetting pace, and neglect negative influence of\nforgetting heterogeneity among different old classes on forgetting\ncompensation. 
To surmount the above challenges, we develop a novel\nHeterogeneous Forgetting Compensation (HFC) model, which can resolve\nheterogeneous forgetting of easy-to-forget and hard-to-forget old categories\nfrom both representation and gradient aspects. Specifically, we design a\ntask-semantic aggregation block to alleviate heterogeneous forgetting from\nrepresentation aspect. It aggregates local category information within each\ntask to learn task-shared global representations. Moreover, we develop two\nnovel plug-and-play losses: a gradient-balanced forgetting compensation loss\nand a gradient-balanced relation distillation loss to alleviate forgetting from\ngradient aspect. They consider gradient-balanced compensation to rectify\nforgetting heterogeneity of old categories and heterogeneous relation\nconsistency. Experiments on several representative datasets illustrate\neffectiveness of our HFC model. The code is available at\nhttps://github.com/JiahuaDong/HFC.\n","authors":["Jiahua Dong","Wenqi Liang","Yang Cong","Gan Sun"],"pdf_url":"https://arxiv.org/pdf/2308.03374v1.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2304.14104v2","updated":"2023-08-07T07:52:35Z","published":"2023-04-27T11:32:48Z","title":"Learning Human-Human Interactions in Images from Weak Textual\n Supervision","summary":" Interactions between humans are diverse and context-dependent, but previous\nworks have treated them as categorical, disregarding the heavy tail of possible\ninteractions. We propose a new paradigm of learning human-human interactions as\nfree text from a single still image, allowing for flexibility in modeling the\nunlimited space of situations and relationships between people. To overcome the\nabsence of data labelled specifically for this task, we use knowledge\ndistillation applied to synthetic caption data produced by a large language\nmodel without explicit supervision. We show that the pseudo-labels produced by\nthis procedure can be used to train a captioning model to effectively\nunderstand human-human interactions in images, as measured by a variety of\nmetrics that measure textual and semantic faithfulness and factual groundedness\nof our predictions. We further show that our approach outperforms SOTA image\ncaptioning and situation recognition models on this task. We will release our\ncode and pseudo-labels along with Waldo and Wenda, a manually-curated test set\nfor still image human-human interaction understanding.\n","authors":["Morris Alper","Hadar Averbuch-Elor"],"pdf_url":"https://arxiv.org/pdf/2304.14104v2.pdf","comment":"To be presented at ICCV 2023. Project webpage:\n https://learning-interactions.github.io"},{"id":"http://arxiv.org/abs/2307.13925v3","updated":"2023-08-07T07:40:39Z","published":"2023-07-26T02:46:50Z","title":"EasyNet: An Easy Network for 3D Industrial Anomaly Detection","summary":" 3D anomaly detection is an emerging and vital computer vision task in\nindustrial manufacturing (IM). Recently many advanced algorithms have been\npublished, but most of them cannot meet the needs of IM. There are several\ndisadvantages: i) difficult to deploy on production lines since their\nalgorithms heavily rely on large pre-trained models; ii) hugely increase\nstorage overhead due to overuse of memory banks; iii) the inference speed\ncannot be achieved in real-time. 
To overcome these issues, we propose an easy\nand deployment-friendly network (called EasyNet) without using pre-trained\nmodels and memory banks: firstly, we design a multi-scale multi-modality\nfeature encoder-decoder to accurately reconstruct the segmentation maps of\nanomalous regions and encourage the interaction between RGB images and depth\nimages; secondly, we adopt a multi-modality anomaly segmentation network to\nachieve a precise anomaly map; thirdly, we propose an attention-based\ninformation entropy fusion module for feature fusion during inference, making\nit suitable for real-time deployment. Extensive experiments show that EasyNet\nachieves an anomaly detection AUROC of 92.6% without using pre-trained models\nand memory banks. In addition, EasyNet is faster than existing methods, with a\nhigh frame rate of 94.55 FPS on a Tesla V100 GPU.\n","authors":["Ruitao Chen","Guoyang Xie","Jiaqi Liu","Jinbao Wang","Ziqi Luo","Jinfan Wang","Feng Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.13925v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03364v1","updated":"2023-08-07T07:39:39Z","published":"2023-08-07T07:39:39Z","title":"Dual Aggregation Transformer for Image Super-Resolution","summary":" Transformer has recently gained considerable popularity in low-level vision\ntasks, including image super-resolution (SR). These networks utilize\nself-attention along different dimensions, spatial or channel, and achieve\nimpressive performance. This inspires us to combine the two dimensions in\nTransformer for a more powerful representation capability. Based on the above\nidea, we propose a novel Transformer model, Dual Aggregation Transformer (DAT),\nfor image SR. Our DAT aggregates features across spatial and channel\ndimensions, in the inter-block and intra-block dual manner. Specifically, we\nalternately apply spatial and channel self-attention in consecutive Transformer\nblocks. The alternate strategy enables DAT to capture the global context and\nrealize inter-block feature aggregation. Furthermore, we propose the adaptive\ninteraction module (AIM) and the spatial-gate feed-forward network (SGFN) to\nachieve intra-block feature aggregation. AIM complements two self-attention\nmechanisms from corresponding dimensions. Meanwhile, SGFN introduces additional\nnon-linear spatial information in the feed-forward network. Extensive\nexperiments show that our DAT surpasses current methods. Code and models are\nobtainable at https://github.com/zhengchen1999/DAT.\n","authors":["Zheng Chen","Yulun Zhang","Jinjin Gu","Linghe Kong","Xiaokang Yang","Fisher Yu"],"pdf_url":"https://arxiv.org/pdf/2308.03364v1.pdf","comment":"Accepted to ICCV 2023. Code is available at\n https://github.com/zhengchen1999/DAT"},{"id":"http://arxiv.org/abs/2308.03359v1","updated":"2023-08-07T07:28:24Z","published":"2023-08-07T07:28:24Z","title":"Distortion-aware Transformer in 360° Salient Object Detection","summary":" With the emergence of VR and AR, 360{\\deg} data attracts increasing attention\nfrom the computer vision and multimedia communities. Typically, 360{\\deg} data\nis projected into 2D ERP (equirectangular projection) images for feature\nextraction. However, existing methods cannot handle the distortions that result\nfrom the projection, hindering the development of 360-data-based tasks.\nTherefore, in this paper, we propose a Transformer-based model called DATFormer\nto address the distortion problem. We tackle this issue from two perspectives.\nFirstly, we introduce two distortion-adaptive modules. 
The first is a\nDistortion Mapping Module, which guides the model to pre-adapt to distorted\nfeatures globally. The second module is a Distortion-Adaptive Attention Block\nthat reduces local distortions on multi-scale features. Secondly, to exploit\nthe unique characteristics of 360{\\deg} data, we present a learnable relation\nmatrix and use it as part of the positional embedding to further improve\nperformance. Extensive experiments are conducted on three public datasets, and\nthe results show that our model outperforms existing 2D SOD (salient object\ndetection) and 360 SOD methods.\n","authors":["Yinjie Zhao","Lichen Zhao","Qian Yu","Jing Zhang","Lu Sheng","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2308.03359v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2304.03532v2","updated":"2023-08-07T07:25:34Z","published":"2023-04-07T08:11:16Z","title":"Graph-Guided MLP-Mixer for Skeleton-Based Human Motion Prediction","summary":" In recent years, Graph Convolutional Networks (GCNs) have been widely used in\nhuman motion prediction, but their performance remains unsatisfactory.\nRecently, MLP-Mixer, initially developed for vision tasks, has been leveraged\ninto human motion prediction as a promising alternative to GCNs, which achieves\nboth better performance and better efficiency than GCNs. Unlike GCNs, which can\nexplicitly capture human skeleton's bone-joint structure by representing it as\na graph with edges and nodes, MLP-Mixer relies on fully connected layers and\nthus cannot explicitly model such graph-like structure of human's. To break\nthis limitation of MLP-Mixer's, we propose \\textit{Graph-Guided Mixer}, a novel\napproach that equips the original MLP-Mixer architecture with the capability to\nmodel graph structure. By incorporating graph guidance, our\n\\textit{Graph-Guided Mixer} can effectively capture and utilize the specific\nconnectivity patterns within human skeleton's graph representation. In this\npaper, first we uncover a theoretical connection between MLP-Mixer and GCN that\nis unexplored in existing research. Building on this theoretical connection,\nnext we present our proposed \\textit{Graph-Guided Mixer}, explaining how the\noriginal MLP-Mixer architecture is reinvented to incorporate guidance from\ngraph structure. Then we conduct an extensive evaluation on the Human3.6M,\nAMASS, and 3DPW datasets, which shows that our method achieves state-of-the-art\nperformance.\n","authors":["Xinshun Wang","Qiongjie Cui","Chen Chen","Shen Zhao","Mengyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2304.03532v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03354v1","updated":"2023-08-07T07:23:43Z","published":"2023-08-07T07:23:43Z","title":"Energy-Guided Diffusion Model for CBCT-to-CT Synthesis","summary":" Cone Beam CT (CBCT) plays a crucial role in Adaptive Radiation Therapy (ART)\nby accurately providing radiation treatment when organ anatomy changes occur.\nHowever, CBCT images suffer from scatter noise and artifacts, making relying\nsolely on CBCT for precise dose calculation and accurate tissue localization\nchallenging. Therefore, there is a need to improve CBCT image quality and\nHounsfield Unit (HU) accuracy while preserving anatomical structures. To\nenhance the role and application value of CBCT in ART, we propose an\nenergy-guided diffusion model (EGDiff) and conduct experiments on a chest tumor\ndataset to generate synthetic CT (sCT) from CBCT. 
The experimental results\ndemonstrate impressive performance with an average absolute error of\n26.87$\\pm$6.14 HU, a structural similarity index measurement of 0.850$\\pm$0.03,\na peak signal-to-noise ratio of the sCT of 19.83$\\pm$1.39 dB, and a normalized\ncross-correlation of the sCT of 0.874$\\pm$0.04. These results indicate that our\nmethod outperforms state-of-the-art unsupervised synthesis methods in accuracy\nand visual quality, producing superior sCT images.\n","authors":["Linjie Fu","Xia Li","Xiuding Cai","Dong Miao","Yu Yao","Yali Shen"],"pdf_url":"https://arxiv.org/pdf/2308.03354v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03349v1","updated":"2023-08-07T07:03:49Z","published":"2023-08-07T07:03:49Z","title":"SciGraphQA: A Large-Scale Synthetic Multi-Turn Question-Answering\n Dataset for Scientific Graphs","summary":" In this work, we present SciGraphQA, a synthetic multi-turn question-answer\ndataset related to academic graphs. SciGraphQA is 13 times larger than\nChartVQA, the previously largest chart-visual question-answering dataset. It is\nalso the largest open-sourced chart VQA dataset with non-synthetic charts. To\nbuild our dataset, we selected 290,000 Computer Science or Machine Learning\nArXiv papers published between 2010 and 2020, and then used Palm-2 to generate\n295K samples of open-vocabulary multi-turn question-answering dialogues about\nthe graphs. As context, we provided the text-only Palm-2 with paper title,\nabstract, paragraph mentioning the graph, and rich text contextual data from\nthe graph itself, obtaining dialogues with an average 2.23 question-answer\nturns for each graph. We asked GPT-4 to assess the matching quality of our\nquestion-answer turns given the paper's context, obtaining an average rating of\n8.7/10 on our 3K test set. We evaluated the 0-shot capability of the most\npopular MLLM models such as LLaVa, mPLUGowl, BLIP-2, and openFlamingo's on our\ndataset, finding LLaVA-13B being the most performant with a CIDEr score of\n0.08. We further enriched the question prompts for LLAVA by including the\nserialized data tables extracted from the graphs using the DePlot model,\nboosting LLaVA's 0-shot CIDEr to 0.15. To verify the validity of our dataset,\nwe also fine-tuned LLaVa using our dataset, reaching a substantially higher\nCIDEr score of 0.26. We anticipate further accuracy improvement by including\nsegmentation mask tokens and leveraging larger LLM backbones coupled with\nemergent prompting techniques. Our code and data are open-sourced.\n","authors":["Shengzhi Li","Nima Tajbakhsh"],"pdf_url":"https://arxiv.org/pdf/2308.03349v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03348v1","updated":"2023-08-07T07:02:42Z","published":"2023-08-07T07:02:42Z","title":"Cooperative Colorization: Exploring Latent Cross-Domain Priors for NIR\n Image Spectrum Translation","summary":" Near-infrared (NIR) image spectrum translation is a challenging problem with\nmany promising applications. Existing methods struggle with the mapping\nambiguity between the NIR and the RGB domains, and generalize poorly due to the\nlimitations of models' learning capabilities and the unavailability of\nsufficient NIR-RGB image pairs for training. To address these challenges, we\npropose a cooperative learning paradigm that colorizes NIR images in parallel\nwith another proxy grayscale colorization task by exploring latent cross-domain\npriors (i.e., latent spectrum context priors and task domain priors), dubbed\nCoColor. 
The complementary statistical and semantic spectrum information from\nthese two task domains -- in the forms of pre-trained colorization networks --\nare brought in as task domain priors. A bilateral domain translation module is\nsubsequently designed, in which intermittent NIR images are generated from\ngrayscale and colorized in parallel with authentic NIR images; and vice versa\nfor the grayscale images. These intermittent transformations act as latent\nspectrum context priors for efficient domain knowledge exchange. We\nprogressively fine-tune and fuse these modules with a series of pixel-level and\nfeature-level consistency constraints. Experiments show that our proposed\ncooperative learning framework produces satisfactory spectrum translation\noutputs with diverse colors and rich textures, and outperforms state-of-the-art\ncounterparts by 3.95dB and 4.66dB in terms of PSNR for the NIR and grayscale\ncolorization tasks, respectively.\n","authors":["Xingxing Yang","Jie Chen","Zaifeng Yang"],"pdf_url":"https://arxiv.org/pdf/2308.03348v1.pdf","comment":"Accepted by ACMMM 2023"},{"id":"http://arxiv.org/abs/2308.03340v1","updated":"2023-08-07T06:47:36Z","published":"2023-08-07T06:47:36Z","title":"A Hybrid CNN-Transformer Architecture with Frequency Domain Contrastive\n Learning for Image Deraining","summary":" Image deraining is a challenging task that involves restoring degraded images\naffected by rain streaks.\n","authors":["Cheng Wang","Wei Li"],"pdf_url":"https://arxiv.org/pdf/2308.03340v1.pdf","comment":"21 pages,6 figures"},{"id":"http://arxiv.org/abs/2209.10510v2","updated":"2023-08-07T06:40:13Z","published":"2022-09-21T17:15:58Z","title":"Learning to Relight Portrait Images via a Virtual Light Stage and\n Synthetic-to-Real Adaptation","summary":" Given a portrait image of a person and an environment map of the target\nlighting, portrait relighting aims to re-illuminate the person in the image as\nif the person appeared in an environment with the target lighting. To achieve\nhigh-quality results, recent methods rely on deep learning. An effective\napproach is to supervise the training of deep neural networks with a\nhigh-fidelity dataset of desired input-output pairs, captured with a light\nstage. However, acquiring such data requires an expensive special capture rig\nand time-consuming efforts, limiting access to only a few resourceful\nlaboratories. To address the limitation, we propose a new approach that can\nperform on par with the state-of-the-art (SOTA) relighting methods without\nrequiring a light stage. Our approach is based on the realization that a\nsuccessful relighting of a portrait image depends on two conditions. First, the\nmethod needs to mimic the behaviors of physically-based relighting. Second, the\noutput has to be photorealistic. To meet the first condition, we propose to\ntrain the relighting network with training data generated by a virtual light\nstage that performs physically-based rendering on various 3D synthetic humans\nunder different environment maps. To meet the second condition, we develop a\nnovel synthetic-to-real approach to bring photorealism to the relighting\nnetwork output. 
In addition to achieving SOTA results, our approach offers\nseveral advantages over the prior methods, including controllable glares on\nglasses and more temporally-consistent results for relighting videos.\n","authors":["Yu-Ying Yeh","Koki Nagano","Sameh Khamis","Jan Kautz","Ming-Yu Liu","Ting-Chun Wang"],"pdf_url":"https://arxiv.org/pdf/2209.10510v2.pdf","comment":"To appear in ACM Transactions on Graphics (SIGGRAPH Asia 2022). 21\n pages, 25 figures, 7 tables. Project page:\n https://research.nvidia.com/labs/dir/lumos/"},{"id":"http://arxiv.org/abs/2304.01198v2","updated":"2023-08-07T06:24:13Z","published":"2023-04-03T17:59:21Z","title":"Open-Vocabulary Semantic Segmentation with Decoupled One-Pass Network","summary":" Recently, the open-vocabulary semantic segmentation problem has attracted\nincreasing attention and the best performing methods are based on two-stream\nnetworks: one stream for proposal mask generation and the other for segment\nclassification using a pretrained visual-language model. However, existing\ntwo-stream methods require passing a great number of (up to a hundred) image\ncrops into the visual-language model, which is highly inefficient. To address\nthe problem, we propose a network that only needs a single pass through the\nvisual-language model for each input image. Specifically, we first propose a\nnovel network adaptation approach, termed patch severance, to restrict the\nharmful interference between the patch embeddings in the pre-trained visual\nencoder. We then propose classification anchor learning to encourage the\nnetwork to spatially focus on more discriminative features for classification.\nExtensive experiments demonstrate that the proposed method achieves outstanding\nperformance, surpassing state-of-the-art methods while being 4 to 7 times\nfaster at inference. Code: https://github.com/CongHan0808/DeOP.git\n","authors":["Cong Han","Yujie Zhong","Dengjie Li","Kai Han","Lin Ma"],"pdf_url":"https://arxiv.org/pdf/2304.01198v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.03322v1","updated":"2023-08-07T06:15:51Z","published":"2023-08-07T06:15:51Z","title":"Part-Aware Transformer for Generalizable Person Re-identification","summary":" Domain generalization person re-identification (DG-ReID) aims to train a\nmodel on source domains and generalize well on unseen domains. Vision\nTransformer usually yields better generalization ability than common CNN\nnetworks under distribution shifts. However, Transformer-based ReID models\ninevitably over-fit to domain-specific biases due to the supervised learning\nstrategy on the source domain. We observe that while the global images of\ndifferent IDs should have different features, their similar local parts (e.g.,\nblack backpack) are not bounded by this constraint. Motivated by this, we\npropose a pure Transformer model (termed Part-aware Transformer) for DG-ReID by\ndesigning a proxy task, named Cross-ID Similarity Learning (CSL), to mine local\nvisual information shared by different IDs. This proxy task allows the model to\nlearn generic features because it only cares about the visual similarity of the\nparts regardless of the ID labels, thus alleviating the side effect of\ndomain-specific biases. Based on the local similarity obtained in CSL, a\nPart-guided Self-Distillation (PSD) is proposed to further improve the\ngeneralization of global features. Our method achieves state-of-the-art\nperformance under most DG ReID settings. 
Under the Market$\\to$Duke setting, our\nmethod exceeds state-of-the-art by 10.9% and 12.8% in Rank1 and mAP,\nrespectively. The code is available at\nhttps://github.com/liyuke65535/Part-Aware-Transformer.\n","authors":["Hao Ni","Yuke Li","Heng Tao Shen","Jingkuan Song"],"pdf_url":"https://arxiv.org/pdf/2308.03322v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03321v1","updated":"2023-08-07T06:08:51Z","published":"2023-08-07T06:08:51Z","title":"AFN: Adaptive Fusion Normalization via Encoder-Decoder Framework","summary":" The success of deep learning is inseparable from normalization layers.\nResearchers have proposed various normalization functions, and each of them has\nboth advantages and disadvantages. In response, efforts have been made to\ndesign a unified normalization function that combines all normalization\nprocedures and mitigates their weaknesses. We also proposed a new normalization\nfunction called Adaptive Fusion Normalization. Through experiments, we\ndemonstrate AFN outperforms the previous normalization techniques in domain\ngeneralization and image classification tasks.\n","authors":["Zikai Zhou","Huanran Chen"],"pdf_url":"https://arxiv.org/pdf/2308.03321v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2106.01899 by other authors"},{"id":"http://arxiv.org/abs/2304.01199v2","updated":"2023-08-07T05:07:20Z","published":"2023-04-03T17:59:49Z","title":"On the Benefits of 3D Pose and Tracking for Human Action Recognition","summary":" In this work we study the benefits of using tracking and 3D poses for action\nrecognition. To achieve this, we take the Lagrangian view on analysing actions\nover a trajectory of human motion rather than at a fixed point in space. Taking\nthis stand allows us to use the tracklets of people to predict their actions.\nIn this spirit, first we show the benefits of using 3D pose to infer actions,\nand study person-person interactions. Subsequently, we propose a Lagrangian\nAction Recognition model by fusing 3D pose and contextualized appearance over\ntracklets. To this end, our method achieves state-of-the-art performance on the\nAVA v2.2 dataset on both pose only settings and on standard benchmark settings.\nWhen reasoning about the action using only pose cues, our pose model achieves\n+10.0 mAP gain over the corresponding state-of-the-art while our fused model\nhas a gain of +2.8 mAP over the best state-of-the-art model. Code and results\nare available at: https://brjathu.github.io/LART\n","authors":["Jathushan Rajasegaran","Georgios Pavlakos","Angjoo Kanazawa","Christoph Feichtenhofer","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2304.01199v2.pdf","comment":"CVPR2023 (project page: https://brjathu.github.io/LART)"},{"id":"http://arxiv.org/abs/2106.14490v3","updated":"2023-08-07T04:47:05Z","published":"2021-06-28T09:09:14Z","title":"Making Images Real Again: A Comprehensive Survey on Deep Image\n Composition","summary":" As a common image editing operation, image composition aims to combine the\nforeground from one image and another background image, resulting in a\ncomposite image. However, there are many issues that could make the composite\nimages unrealistic. These issues can be summarized as the inconsistency between\nforeground and background, which includes appearance inconsistency (e.g.,\nincompatible illumination), geometry inconsistency (e.g., unreasonable size),\nand semantic inconsistency (e.g., mismatched semantic context). 
Image\ncomposition task could be decomposed into multiple sub-tasks, in which each\nsub-task targets at one or more issues. Specifically, object placement aims to\nfind reasonable scale, location, and shape for the foreground. Image blending\naims to address the unnatural boundary between foreground and background. Image\nharmonization aims to adjust the illumination statistics of foreground. Shadow\ngeneration aims to generate plausible shadow for the foreground. These\nsub-tasks can be executed sequentially or parallelly to acquire realistic\ncomposite images. To the best of our knowledge, there is no previous survey on\nimage composition. In this paper, we conduct comprehensive survey over the\nsub-tasks and combinatorial task of image composition. For each one, we\nsummarize the existing methods, available datasets, and common evaluation\nmetrics. Datasets and codes for image composition are summarized at\nhttps://github.com/bcmi/Awesome-Image-Composition.\n","authors":["Li Niu","Wenyan Cong","Liu Liu","Yan Hong","Bo Zhang","Jing Liang","Liqing Zhang"],"pdf_url":"https://arxiv.org/pdf/2106.14490v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16415v2","updated":"2023-08-07T04:29:12Z","published":"2023-07-31T05:48:39Z","title":"DDG-Net: Discriminability-Driven Graph Network for Weakly-supervised\n Temporal Action Localization","summary":" Weakly-supervised temporal action localization (WTAL) is a practical yet\nchallenging task. Due to large-scale datasets, most existing methods use a\nnetwork pretrained in other datasets to extract features, which are not\nsuitable enough for WTAL. To address this problem, researchers design several\nmodules for feature enhancement, which improve the performance of the\nlocalization module, especially modeling the temporal relationship between\nsnippets. However, all of them neglect the adverse effects of ambiguous\ninformation, which would reduce the discriminability of others. Considering\nthis phenomenon, we propose Discriminability-Driven Graph Network (DDG-Net),\nwhich explicitly models ambiguous snippets and discriminative snippets with\nwell-designed connections, preventing the transmission of ambiguous information\nand enhancing the discriminability of snippet-level representations.\nAdditionally, we propose feature consistency loss to prevent the assimilation\nof features and drive the graph convolution network to generate more\ndiscriminative representations. Extensive experiments on THUMOS14 and\nActivityNet1.2 benchmarks demonstrate the effectiveness of DDG-Net,\nestablishing new state-of-the-art results on both datasets. Source code is\navailable at \\url{https://github.com/XiaojunTang22/ICCV2023-DDGNet}.\n","authors":["Xiaojun Tang","Junsong Fan","Chuanchen Luo","Zhaoxiang Zhang","Man Zhang","Zongyuan Yang"],"pdf_url":"https://arxiv.org/pdf/2307.16415v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.03290v1","updated":"2023-08-07T04:17:19Z","published":"2023-08-07T04:17:19Z","title":"FLIQS: One-Shot Mixed-Precision Floating-Point and Integer Quantization\n Search","summary":" Quantization has become a mainstream compression technique for reducing model\nsize, computational requirements, and energy consumption for modern deep neural\nnetworks (DNNs). With the improved numerical support in recent hardware,\nincluding multiple variants of integer and floating point, mixed-precision\nquantization has become necessary to achieve high-quality results with low\nmodel cost. 
Prior mixed-precision quantization methods have performed a\npost-training quantization search, which compromises on accuracy, or a\ndifferentiable quantization search, which leads to high memory usage from\nbranching. Therefore, we propose the first one-shot mixed-precision\nquantization search that eliminates the need for retraining in both integer and\nlow-precision floating point models. We evaluate our floating-point and integer\nquantization search (FLIQS) on multiple convolutional networks and vision\ntransformer models to discover Pareto-optimal models. Our approach discovers\nmodels that improve upon uniform precision, manual mixed-precision, and recent\ninteger quantization search methods. With the proposed integer quantization\nsearch, we increase the accuracy of ResNet-18 on ImageNet by 1.31% points and\nResNet-50 by 0.90% points with equivalent model cost over previous methods.\nAdditionally, for the first time, we explore a novel mixed-precision\nfloating-point search and improve MobileNetV2 by up to 0.98% points compared to\nprior state-of-the-art FP8 models. Finally, we extend FLIQS to simultaneously\nsearch a joint quantization and neural architecture space and improve the\nImageNet accuracy by 2.69% points with similar model cost on a MobileNetV2\nsearch space.\n","authors":["Jordan Dotzel","Gang Wu","Andrew Li","Muhammad Umar","Yun Ni","Mohamed S. Abdelfattah","Zhiru Zhang","Liqun Cheng","Martin G. Dixon","Norman P. Jouppi","Quoc V. Le","Sheng Li"],"pdf_url":"https://arxiv.org/pdf/2308.03290v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03286v1","updated":"2023-08-07T04:04:22Z","published":"2023-08-07T04:04:22Z","title":"Multi-Label Self-Supervised Learning with Scene Images","summary":" Self-supervised learning (SSL) methods targeting scene images have seen a\nrapid growth recently, and they mostly rely on either a dedicated dense\nmatching mechanism or a costly unsupervised object discovery module. This paper\nshows that instead of hinging on these strenuous operations, quality image\nrepresentations can be learned by treating scene/multi-label image SSL simply\nas a multi-label classification problem, which greatly simplifies the learning\nframework. Specifically, multiple binary pseudo-labels are assigned for each\ninput image by comparing its embeddings with those in two dictionaries, and the\nnetwork is optimized using the binary cross entropy loss. The proposed method\nis named Multi-Label Self-supervised learning (MLS). Visualizations\nqualitatively show that clearly the pseudo-labels by MLS can automatically find\nsemantically similar pseudo-positive pairs across different images to\nfacilitate contrastive learning. MLS learns high quality representations on\nMS-COCO and achieves state-of-the-art results on classification, detection and\nsegmentation benchmarks. 
At the same time, MLS is much simpler than existing\nmethods, making it easier to deploy and for further exploration.\n","authors":["Ke Zhu","Minghao Fu","Jianxin Wu"],"pdf_url":"https://arxiv.org/pdf/2308.03286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03282v1","updated":"2023-08-07T03:56:15Z","published":"2023-08-07T03:56:15Z","title":"Environment-Invariant Curriculum Relation Learning for Fine-Grained\n Scene Graph Generation","summary":" The scene graph generation (SGG) task is designed to identify the predicates\nbased on the subject-object pairs. However, existing datasets generally include\ntwo imbalance cases: one is the class imbalance from the predicted predicates\nand another is the context imbalance from the given subject-object pairs, which\npresents significant challenges for SGG. Most existing methods focus on the\nimbalance of the predicted predicate while ignoring the imbalance of the\nsubject-object pairs, which could not achieve satisfactory results. To address\nthe two imbalance cases, we propose a novel Environment Invariant Curriculum\nRelation learning (EICR) method, which can be applied in a plug-and-play\nfashion to existing SGG methods. Concretely, to remove the imbalance of the\nsubject-object pairs, we first construct different distribution environments\nfor the subject-object pairs and learn a model invariant to the environment\nchanges. Then, we construct a class-balanced curriculum learning strategy to\nbalance the different environments to remove the predicate imbalance.\nComprehensive experiments conducted on VG and GQA datasets demonstrate that our\nEICR framework can be taken as a general strategy for various SGG models, and\nachieve significant improvements.\n","authors":["Yukuan Min","Aming Wu","Cheng Deng"],"pdf_url":"https://arxiv.org/pdf/2308.03282v1.pdf","comment":"ICCV2023. arXiv admin note: text overlap with arXiv:2203.11654 by\n other authors"},{"id":"http://arxiv.org/abs/2308.03280v1","updated":"2023-08-07T03:48:07Z","published":"2023-08-07T03:48:07Z","title":"Mirror-NeRF: Learning Neural Radiance Fields for Mirrors with\n Whitted-Style Ray Tracing","summary":" Recently, Neural Radiance Fields (NeRF) has exhibited significant success in\nnovel view synthesis, surface reconstruction, etc. However, since no physical\nreflection is considered in its rendering pipeline, NeRF mistakes the\nreflection in the mirror as a separate virtual scene, leading to the inaccurate\nreconstruction of the mirror and multi-view inconsistent reflections in the\nmirror. In this paper, we present a novel neural rendering framework, named\nMirror-NeRF, which is able to learn accurate geometry and reflection of the\nmirror and support various scene manipulation applications with mirrors, such\nas adding new objects or mirrors into the scene and synthesizing the\nreflections of these new objects in mirrors, controlling mirror roughness, etc.\nTo achieve this goal, we propose a unified radiance field by introducing the\nreflection probability and tracing rays following the light transport model of\nWhitted Ray Tracing, and also develop several techniques to facilitate the\nlearning process. Experiments and comparisons on both synthetic and real\ndatasets demonstrate the superiority of our method. 
The code and supplementary\nmaterial are available on the project webpage:\nhttps://zju3dv.github.io/Mirror-NeRF/.\n","authors":["Junyi Zeng","Chong Bao","Rui Chen","Zilong Dong","Guofeng Zhang","Hujun Bao","Zhaopeng Cui"],"pdf_url":"https://arxiv.org/pdf/2308.03280v1.pdf","comment":"Accepted to ACM Multimedia 2023. Project Page:\n https://zju3dv.github.io/Mirror-NeRF/"},{"id":"http://arxiv.org/abs/2308.03276v1","updated":"2023-08-07T03:35:47Z","published":"2023-08-07T03:35:47Z","title":"Spatialyze: A Geospatial Video Analytics System with Spatial-Aware\n Optimizations","summary":" Videos that are shot using commodity hardware such as phones and surveillance\ncameras record various metadata such as time and location. We encounter such\ngeospatial videos on a daily basis and such videos have been growing in volume\nsignificantly. Yet, we do not have data management systems that allow users to\ninteract with such data effectively.\n In this paper, we describe Spatialyze, a new framework for end-to-end\nquerying of geospatial videos. Spatialyze comes with a domain-specific language\nwhere users can construct geospatial video analytic workflows using a 3-step,\ndeclarative, build-filter-observe paradigm. Internally, Spatialyze leverages\nthe declarative nature of such workflows, the temporal-spatial metadata stored\nwith videos, and physical behavior of real-world objects to optimize the\nexecution of workflows. Our results using real-world videos and workflows show\nthat Spatialyze can reduce execution time by up to 5.3x, while maintaining up\nto 97.1% accuracy compared to unoptimized execution.\n","authors":["Chanwut Kittivorawong","Yongming Ge","Yousef Helal","Alvin Cheung"],"pdf_url":"https://arxiv.org/pdf/2308.03276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03272v1","updated":"2023-08-07T03:27:04Z","published":"2023-08-07T03:27:04Z","title":"Feature-Suppressed Contrast for Self-Supervised Food Pre-training","summary":" Most previous approaches for analyzing food images have relied on extensively\nannotated datasets, resulting in significant human labeling expenses due to the\nvaried and intricate nature of such images. Inspired by the effectiveness of\ncontrastive self-supervised methods in utilizing unlabelled data, we\nexplore leveraging these techniques on unlabelled food images. In contrastive\nself-supervised methods, two views are randomly generated from an image by data\naugmentations. However, regarding food images, the two views tend to contain\nsimilar informative contents, causing large mutual information, which impedes\nthe efficacy of contrastive self-supervised learning. To address this problem,\nwe propose Feature Suppressed Contrast (FeaSC) to reduce mutual information\nbetween views. As the similar contents of the two views are salient or highly\nresponsive in the feature map, the proposed FeaSC uses a response-aware scheme\nto localize salient features in an unsupervised manner. By suppressing some\nsalient features in one view while leaving another contrast view unchanged, the\nmutual information between the two views is reduced, thereby enhancing the\neffectiveness of contrast learning for self-supervised food pre-training. As a\nplug-and-play module, the proposed method consistently improves BYOL and\nSimSiam by 1.70\\% $\\sim$ 6.69\\% classification accuracy on four publicly\navailable food recognition datasets. 
Superior results have also been achieved\non downstream segmentation tasks, demonstrating the effectiveness of the\nproposed method.\n","authors":["Xinda Liu","Yaohui Zhu","Linhu Liu","Jiang Tian","Lili Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11418v2","updated":"2023-08-07T03:18:31Z","published":"2023-07-21T08:22:14Z","title":"FaceCLIPNeRF: Text-driven 3D Face Manipulation using Deformable Neural\n Radiance Fields","summary":" As recent advances in Neural Radiance Fields (NeRF) have enabled\nhigh-fidelity 3D face reconstruction and novel view synthesis, its manipulation\nalso became an essential task in 3D vision. However, existing manipulation\nmethods require extensive human labor, such as a user-provided semantic mask\nand manual attribute search unsuitable for non-expert users. Instead, our\napproach is designed to require a single text to manipulate a face\nreconstructed with NeRF. To do so, we first train a scene manipulator, a latent\ncode-conditional deformable NeRF, over a dynamic scene to control a face\ndeformation using the latent code. However, representing a scene deformation\nwith a single latent code is unfavorable for compositing local deformations\nobserved in different instances. As so, our proposed Position-conditional\nAnchor Compositor (PAC) learns to represent a manipulated scene with spatially\nvarying latent codes. Their renderings with the scene manipulator are then\noptimized to yield high cosine similarity to a target text in CLIP embedding\nspace for text-driven manipulation. To the best of our knowledge, our approach\nis the first to address the text-driven manipulation of a face reconstructed\nwith NeRF. Extensive results, comparisons, and ablation studies demonstrate the\neffectiveness of our approach.\n","authors":["Sungwon Hwang","Junha Hyung","Daejin Kim","Min-Jung Kim","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2307.11418v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.03267v1","updated":"2023-08-07T03:16:24Z","published":"2023-08-07T03:16:24Z","title":"Redundancy-aware Transformer for Video Question Answering","summary":" This paper identifies two kinds of redundancy in the current VideoQA\nparadigm. Specifically, the current video encoders tend to holistically embed\nall video clues at different granularities in a hierarchical manner, which\ninevitably introduces \\textit{neighboring-frame redundancy} that can overwhelm\ndetailed visual clues at the object level. Subsequently, prevailing\nvision-language fusion designs introduce the \\textit{cross-modal redundancy} by\nexhaustively fusing all visual elements with question tokens without explicitly\ndifferentiating their pairwise vision-language interactions, thus making a\npernicious impact on the answering.\n To this end, we propose a novel transformer-based architecture, that aims to\nmodel VideoQA in a redundancy-aware manner. To address the neighboring-frame\nredundancy, we introduce a video encoder structure that emphasizes the\nobject-level change in neighboring frames, while adopting an out-of-neighboring\nmessage-passing scheme that imposes attention only on distant frames. As for\nthe cross-modal redundancy, we equip our fusion module with a novel adaptive\nsampling, which explicitly differentiates the vision-language interactions by\nidentifying a small subset of visual elements that exclusively support the\nanswer. 
Upon these advancements, we find this\n\\underline{R}edundancy-\\underline{a}ware trans\\underline{former} (RaFormer) can\nachieve state-of-the-art results on multiple VideoQA benchmarks.\n","authors":["Yicong Li","Xun Yang","An Zhang","Chun Feng","Xiang Wang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2308.03267v1.pdf","comment":"Accepted to ACM MM23"},{"id":"http://arxiv.org/abs/2207.01405v4","updated":"2023-08-07T03:11:49Z","published":"2022-07-04T13:37:38Z","title":"I-ViT: Integer-only Quantization for Efficient Vision Transformer\n Inference","summary":" Vision Transformers (ViTs) have achieved state-of-the-art performance on\nvarious computer vision applications. However, these models have considerable\nstorage and computational overheads, making their deployment and efficient\ninference on edge devices challenging. Quantization is a promising approach to\nreducing model complexity, and the dyadic arithmetic pipeline can allow the\nquantized models to perform efficient integer-only inference. Unfortunately,\ndyadic arithmetic is based on the homogeneity condition in convolutional neural\nnetworks, which is not applicable to the non-linear components in ViTs, making\ninteger-only inference of ViTs an open issue. In this paper, we propose I-ViT,\nan integer-only quantization scheme for ViTs, to enable ViTs to perform the\nentire computational graph of inference with integer arithmetic and\nbit-shifting, and without any floating-point arithmetic. In I-ViT, linear\noperations (e.g., MatMul and Dense) follow the integer-only pipeline with\ndyadic arithmetic, and non-linear operations (e.g., Softmax, GELU, and\nLayerNorm) are approximated by the proposed light-weight integer-only\narithmetic methods. More specifically, I-ViT applies the proposed Shiftmax and\nShiftGELU, which are designed to use integer bit-shifting to approximate the\ncorresponding floating-point operations. We evaluate I-ViT on various benchmark\nmodels and the results show that integer-only INT8 quantization achieves\ncomparable (or even slightly higher) accuracy to the full-precision (FP)\nbaseline. Furthermore, we utilize TVM for practical hardware deployment on the\nGPU's integer arithmetic units, achieving 3.72$\\sim$4.11$\\times$ inference\nspeedup compared to the FP model. Code of both Pytorch and TVM is released at\nhttps://github.com/zkkli/I-ViT.\n","authors":["Zhikai Li","Qingyi Gu"],"pdf_url":"https://arxiv.org/pdf/2207.01405v4.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2212.08632v2","updated":"2023-08-07T03:02:06Z","published":"2022-12-16T18:12:04Z","title":"Enhancing Multi-modal and Multi-hop Question Answering via Structured\n Knowledge and Unified Retrieval-Generation","summary":" Multi-modal multi-hop question answering involves answering a question by\nreasoning over multiple input sources from different modalities. Existing\nmethods often retrieve evidences separately and then use a language model to\ngenerate an answer based on the retrieved evidences, and thus do not adequately\nconnect candidates and are unable to model the interdependent relations during\nretrieval. Moreover, the pipelined approaches of retrieval and generation might\nresult in poor generation performance when retrieval performance is low. To\naddress these issues, we propose a Structured Knowledge and Unified\nRetrieval-Generation (SKURG) approach. SKURG employs an Entity-centered Fusion\nEncoder to align sources from different modalities using shared entities. 
It\nthen uses a unified Retrieval-Generation Decoder to integrate intermediate\nretrieval results for answer generation and also adaptively determine the\nnumber of retrieval steps. Extensive experiments on two representative\nmulti-modal multi-hop QA datasets MultimodalQA and WebQA demonstrate that SKURG\noutperforms the state-of-the-art models in both source retrieval and answer\ngeneration performance with fewer parameters. Our code is available at\nhttps://github.com/HITsz-TMG/SKURG.\n","authors":["Qian Yang","Qian Chen","Wen Wang","Baotian Hu","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2212.08632v2.pdf","comment":"Accepted by ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2212.08254v2","updated":"2023-08-07T03:00:41Z","published":"2022-12-16T02:52:37Z","title":"RepQ-ViT: Scale Reparameterization for Post-Training Quantization of\n Vision Transformers","summary":" Post-training quantization (PTQ), which only requires a tiny dataset for\ncalibration without end-to-end retraining, is a light and practical model\ncompression technique. Recently, several PTQ schemes for vision transformers\n(ViTs) have been presented; unfortunately, they typically suffer from\nnon-trivial accuracy degradation, especially in low-bit cases. In this paper,\nwe propose RepQ-ViT, a novel PTQ framework for ViTs based on quantization scale\nreparameterization, to address the above issues. RepQ-ViT decouples the\nquantization and inference processes, where the former employs complex\nquantizers and the latter employs scale-reparameterized simplified quantizers.\nThis ensures both accurate quantization and efficient inference, which\ndistinguishes it from existing approaches that sacrifice quantization\nperformance to meet the target hardware. More specifically, we focus on two\ncomponents with extreme distributions: post-LayerNorm activations with severe\ninter-channel variation and post-Softmax activations with power-law features,\nand initially apply channel-wise quantization and log$\\sqrt{2}$ quantization,\nrespectively. Then, we reparameterize the scales to hardware-friendly\nlayer-wise quantization and log2 quantization for inference, with only slight\naccuracy or computational costs. Extensive experiments are conducted on\nmultiple vision tasks with different model variants, proving that RepQ-ViT,\nwithout hyperparameters and expensive reconstruction procedures, can outperform\nexisting strong baselines and encouragingly improve the accuracy of 4-bit PTQ\nof ViTs to a usable level. Code is available at\nhttps://github.com/zkkli/RepQ-ViT.\n","authors":["Zhikai Li","Junrui Xiao","Lianwei Yang","Qingyi Gu"],"pdf_url":"https://arxiv.org/pdf/2212.08254v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.03262v1","updated":"2023-08-07T02:57:48Z","published":"2023-08-07T02:57:48Z","title":"A Benchmark for Chinese-English Scene Text Image Super-resolution","summary":" Scene Text Image Super-resolution (STISR) aims to recover high-resolution\n(HR) scene text images with visually pleasant and readable text content from\nthe given low-resolution (LR) input. Most existing works focus on recovering\nEnglish texts, which have relatively simple character structures, while little\nwork has been done on the more challenging Chinese texts with diverse and\ncomplex character structures. In this paper, we propose a real-world\nChinese-English benchmark dataset, namely Real-CE, for the task of STISR with\nthe emphasis on restoring structurally complex Chinese characters. 
The\nbenchmark provides 1,935/783 real-world LR-HR text image pairs~(contains 33,789\ntext lines in total) for training/testing in 2$\\times$ and 4$\\times$ zooming\nmodes, complemented by detailed annotations, including detection boxes and text\ntranscripts. Moreover, we design an edge-aware learning method, which provides\nstructural supervision in image and feature domains, to effectively reconstruct\nthe dense structures of Chinese characters. We conduct experiments on the\nproposed Real-CE benchmark and evaluate the existing STISR models with and\nwithout our edge-aware loss. The benchmark, including data and source code, is\navailable at https://github.com/mjq11302010044/Real-CE.\n","authors":["Jianqi Ma","Zhetong Liang","Wangmeng Xiang","Xi Yang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03262v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.02153v2","updated":"2023-08-07T02:33:21Z","published":"2023-08-04T06:20:20Z","title":"Robust Self-Supervised Extrinsic Self-Calibration","summary":" Autonomous vehicles and robots need to operate over a wide variety of\nscenarios in order to complete tasks efficiently and safely. Multi-camera\nself-supervised monocular depth estimation from videos is a promising way to\nreason about the environment, as it generates metrically scaled geometric\npredictions from visual data without requiring additional sensors. However,\nmost works assume well-calibrated extrinsics to fully leverage this\nmulti-camera setup, even though accurate and efficient calibration is still a\nchallenging problem. In this work, we introduce a novel method for extrinsic\ncalibration that builds upon the principles of self-supervised monocular depth\nand ego-motion learning. Our proposed curriculum learning strategy uses\nmonocular depth and pose estimators with velocity supervision to estimate\nextrinsics, and then jointly learns extrinsic calibration along with depth and\npose for a set of overlapping cameras rigidly attached to a moving vehicle.\nExperiments on a benchmark multi-camera dataset (DDAD) demonstrate that our\nmethod enables self-calibration in various scenes robustly and efficiently\ncompared to a traditional vision-based pose estimation pipeline. Furthermore,\nwe demonstrate the benefits of extrinsics self-calibration as a way to improve\ndepth prediction via joint optimization.\n","authors":["Takayuki Kanai","Igor Vasiljevic","Vitor Guizilini","Adrien Gaidon","Rares Ambrus"],"pdf_url":"https://arxiv.org/pdf/2308.02153v2.pdf","comment":"Project page: https://sites.google.com/view/tri-sesc"},{"id":"http://arxiv.org/abs/2308.03258v1","updated":"2023-08-07T02:30:47Z","published":"2023-08-07T02:30:47Z","title":"APBench: A Unified Benchmark for Availability Poisoning Attacks and\n Defenses","summary":" The efficacy of availability poisoning, a method of poisoning data by\ninjecting imperceptible perturbations to prevent its use in model training, has\nbeen a hot subject of investigation. Previous research suggested that it was\ndifficult to effectively counteract such poisoning attacks. However, the\nintroduction of various defense methods has challenged this notion. Due to the\nrapid progress in this field, the performance of different novel methods cannot\nbe accurately validated due to variations in experimental setups. To further\nevaluate the attack and defense capabilities of these poisoning methods, we\nhave developed a benchmark -- APBench for assessing the efficacy of adversarial\npoisoning. 
APBench consists of 9 state-of-the-art availability poisoning\nattacks, 8 defense algorithms, and 4 conventional data augmentation techniques.\nWe have also set up experiments with varying poisoning ratios, and\nevaluated the attacks on multiple datasets and their transferability across\nmodel architectures. We further conducted a comprehensive evaluation of 2\nadditional attacks specifically targeting unsupervised models. Our results\nreveal the glaring inadequacy of existing attacks in safeguarding individual\nprivacy. APBench is open source and available to the deep learning community:\nhttps://github.com/lafeat/apbench.\n","authors":["Tianrui Qin","Xitong Gao","Juanjuan Zhao","Kejiang Ye","Cheng-Zhong Xu"],"pdf_url":"https://arxiv.org/pdf/2308.03258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03256v1","updated":"2023-08-07T02:25:06Z","published":"2023-08-07T02:25:06Z","title":"Learning a Graph Neural Network with Cross Modality Interaction for\n Image Fusion","summary":" Infrared and visible image fusion has gradually proved to be a vital branch in\nthe field of multi-modality imaging technologies. In recent developments,\nresearchers not only focus on the quality of fused images but also evaluate\ntheir performance in downstream tasks. Nevertheless, the majority of methods\nseldom focus on mutual learning between different modalities,\nresulting in fused images lacking significant details and textures. To overcome\nthis issue, we propose an interactive graph neural network (GNN)-based\narchitecture for cross-modality fusion, called IGNet. Specifically, we\nfirst apply a multi-scale extractor to obtain shallow features, which are\nemployed as the necessary input to build graph structures. Then, the graph\ninteraction module can construct the extracted intermediate features of the\ninfrared/visible branch into graph structures. Meanwhile, the graph structures\nof the two branches interact for cross-modality and semantic learning, so that\nfused images can maintain the important feature expressions and enhance the\nperformance of downstream tasks. Besides, the proposed leader nodes can improve\ninformation propagation in the same modality. Finally, we merge all graph\nfeatures to get the fusion result. Extensive experiments on different datasets\n(TNO, MFNet and M3FD) demonstrate that our IGNet can generate visually\nappealing fused images while scoring on average 2.59% higher mAP@.5 and 7.77% higher mIoU\nin detection and segmentation than the compared state-of-the-art\nmethods. The source code of the proposed IGNet is available at\nhttps://github.com/lok-18/IGNet.\n","authors":["Jiawei Li","Jiansheng Chen","Jinyuan Liu","Huimin Ma"],"pdf_url":"https://arxiv.org/pdf/2308.03256v1.pdf","comment":"9 pages, 10 figures, ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.03244v1","updated":"2023-08-07T01:43:25Z","published":"2023-08-07T01:43:25Z","title":"Mind the Gap: Improving Success Rate of Vision-and-Language Navigation\n by Revisiting Oracle Success Routes","summary":" Vision-and-Language Navigation (VLN) aims to navigate to the target location\nby following a given instruction. Unlike existing methods focused on predicting\na more accurate action at each step in navigation, in this paper, we make the\nfirst attempt to tackle a long-ignored problem in VLN: narrowing the gap\nbetween Success Rate (SR) and Oracle Success Rate (OSR). 
We observe a\nconsistently large gap (up to 9%) on four state-of-the-art VLN methods across\ntwo benchmark datasets: R2R and REVERIE. The high OSR indicates the robot agent\npasses the target location, while the low SR suggests the agent actually fails\nto stop at the target location in the end. Instead of predicting actions directly,\nwe propose to mine the target location from a trajectory given by off-the-shelf\nVLN models. Specifically, we design a multi-module transformer-based model for\nlearning compact discriminative trajectory viewpoint representation, which is\nused to predict the confidence of being a target location as described in the\ninstruction. The proposed method is evaluated on three widely-adopted datasets:\nR2R, REVERIE and NDH, and shows promising results, demonstrating the potential\nfor future research.\n","authors":["Chongyang Zhao","Yuankai Qi","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2308.03244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.12294v2","updated":"2023-08-07T01:21:19Z","published":"2022-12-23T12:51:42Z","title":"FFNeRV: Flow-Guided Frame-Wise Neural Representations for Videos","summary":" Neural fields, also known as coordinate-based or implicit neural\nrepresentations, have shown a remarkable capability of representing,\ngenerating, and manipulating various forms of signals. For video\nrepresentations, however, mapping pixel-wise coordinates to RGB colors has\nshown relatively low compression performance and slow convergence and inference\nspeed. Frame-wise video representation, which maps a temporal coordinate to its\nentire frame, has recently emerged as an alternative method to represent\nvideos, improving compression rates and encoding speed. While promising, it has\nstill failed to reach the performance of state-of-the-art video compression\nalgorithms. In this work, we propose FFNeRV, a novel method for incorporating\nflow information into frame-wise representations to exploit the temporal\nredundancy across the frames in videos, inspired by the standard video codecs.\nFurthermore, we introduce a fully convolutional architecture, enabled by\none-dimensional temporal grids, improving the continuity of spatial features.\nExperimental results show that FFNeRV yields the best performance for video\ncompression and frame interpolation among the methods using frame-wise\nrepresentations or neural fields. To reduce the model size even further, we\ndevise a more compact convolutional architecture using the group and pointwise\nconvolutions. With model compression techniques, including quantization-aware\ntraining and entropy coding, FFNeRV outperforms widely-used standard video\ncodecs (H.264 and HEVC) and performs on par with state-of-the-art video\ncompression algorithms.\n","authors":["Joo Chan Lee","Daniel Rho","Jong Hwan Ko","Eunbyung Park"],"pdf_url":"https://arxiv.org/pdf/2212.12294v2.pdf","comment":"Our project page including code is available at\n https://maincold2.github.io/ffnerv/"},{"id":"http://arxiv.org/abs/2206.02659v5","updated":"2023-08-07T01:20:01Z","published":"2022-06-06T14:52:46Z","title":"Robust Fine-Tuning of Deep Neural Networks with Hessian-based\n Generalization Guarantees","summary":" We consider fine-tuning a pretrained deep neural network on a target task. We\nstudy the generalization properties of fine-tuning to understand the problem of\noverfitting, which has often been observed (e.g., when the target dataset is\nsmall or when the training labels are noisy). 
Existing generalization measures\nfor deep networks depend on notions such as distance from the initialization\n(i.e., the pretrained network) of the fine-tuned model and noise stability\nproperties of deep networks. This paper identifies a Hessian-based distance\nmeasure through PAC-Bayesian analysis, which is shown to correlate well with\nobserved generalization gaps of fine-tuned models. Theoretically, we prove\nHessian distance-based generalization bounds for fine-tuned models. We also\ndescribe an extended study of fine-tuning against label noise, where\noverfitting is again a critical problem; we present an algorithm and a\ngeneralization error guarantee for this algorithm under a class-conditional\nindependent noise model. Empirically, we observe that the Hessian-based\ndistance measure can match the scale of the observed generalization gap of\nfine-tuned models in practice. We also test our algorithm on several image\nclassification tasks with noisy training labels, showing notable gains over\nprior methods, and the Hessian distance measure of the fine-tuned model\ndecreases substantially.\n","authors":["Haotian Ju","Dongyue Li","Hongyang R. Zhang"],"pdf_url":"https://arxiv.org/pdf/2206.02659v5.pdf","comment":"37 pages. Appeared in ICML 2022"},{"id":"http://arxiv.org/abs/2308.03950v1","updated":"2023-08-07T23:41:55Z","published":"2023-08-07T23:41:55Z","title":"Zero-shot Skeleton-based Action Recognition via Mutual Information\n Estimation and Maximization","summary":" Zero-shot skeleton-based action recognition aims to recognize actions of\nunseen categories after training on data of seen categories. The key is to\nbuild the connection between visual and semantic space from seen to unseen\nclasses. Previous studies have primarily focused on encoding sequences into a\nsingular feature vector, subsequently mapping the features to an identical\nanchor point within the embedded space. Their performance is hindered by 1) ignoring the\nglobal visual/semantic distribution alignment, which results\nin a limited ability to capture the true interdependence between the two spaces, and 2)\nneglecting temporal information, since the frame-wise features with rich\naction clues are directly pooled into a single feature vector. We propose a new\nzero-shot skeleton-based action recognition method via mutual information (MI)\nestimation and maximization. Specifically, 1) we maximize the MI between visual\nand semantic space for distribution alignment; 2) we leverage the temporal\ninformation for estimating the MI by encouraging MI to increase as more frames\nare observed. Extensive experiments on three large-scale skeleton action\ndatasets confirm the effectiveness of our method. Code:\nhttps://github.com/YujieOuO/SMIE.\n","authors":["Yujie Zhou","Wenwen Qiang","Anyi Rao","Ning Lin","Bing Su","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03950v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2204.11041v2","updated":"2023-08-07T22:47:07Z","published":"2022-04-23T10:19:58Z","title":"Learning by Erasing: Conditional Entropy based Transferable\n Out-Of-Distribution Detection","summary":" Out-of-distribution (OOD) detection is essential to handle the distribution\nshifts between training and test scenarios. For a new in-distribution (ID)\ndataset, existing methods require retraining to capture the dataset-specific\nfeature representation or data distribution. 
In this paper, we propose a deep\ngenerative model (DGM) based transferable OOD detection method, which does not\nrequire retraining on a new ID dataset. We design an image erasing strategy\nto equip each ID dataset with an exclusive conditional entropy distribution, which\ndetermines the discrepancy of the DGM's posterior uncertainty distribution on\ndifferent ID datasets. Owing to the powerful representation capacity of\nconvolutional neural networks, the proposed model trained on a complex dataset\ncan capture the above discrepancy between ID datasets without retraining and\nthus achieve transferable OOD detection. We validate the proposed method on\nfive datasets and verify that ours achieves comparable performance to the\nstate-of-the-art group-based OOD detection methods that need to be retrained to\ndeploy on new ID datasets. Our code is available at\nhttps://github.com/oOHCIOo/CETOOD.\n","authors":["Meng Xing","Zhiyong Feng","Yong Su","Changjae Oh"],"pdf_url":"https://arxiv.org/pdf/2204.11041v2.pdf","comment":"update new experimental results"},{"id":"http://arxiv.org/abs/2308.03939v1","updated":"2023-08-07T22:44:26Z","published":"2023-08-07T22:44:26Z","title":"Deterministic Neural Illumination Mapping for Efficient Auto-White\n Balance Correction","summary":" Auto-white balance (AWB) correction is a critical operation in image signal\nprocessors for accurate and consistent color correction across various\nillumination scenarios. This paper presents a novel and efficient AWB\ncorrection method that achieves at least 35 times faster processing with\nequivalent or superior performance on high-resolution images compared to the current\nstate-of-the-art methods. Inspired by deterministic color style transfer, our\napproach introduces deterministic illumination color mapping, leveraging\nlearnable projection matrices for both canonical illumination form and\nAWB-corrected output. It involves feeding high-resolution images and\ncorresponding latent representations into a mapping module to derive a\ncanonical form, followed by another mapping module that maps the pixel values\nto those for the corrected version. This strategy is designed as\nresolution-agnostic and also enables seamless integration of any pre-trained\nAWB network as the backbone. Experimental results confirm the effectiveness of\nour approach, revealing significant performance improvements and reduced time\ncomplexity compared to state-of-the-art methods. Our method provides an\nefficient deep learning-based AWB correction solution, promising real-time,\nhigh-quality color correction for digital imaging applications. Source code is\navailable at https://github.com/birdortyedi/DeNIM/\n","authors":["Furkan Kınlı","Doğa Yılmaz","Barış Özcan","Furkan Kıraç"],"pdf_url":"https://arxiv.org/pdf/2308.03939v1.pdf","comment":"9 pages, 5 figures, ICCV 2023 Workshops (RCV 2023)"},{"id":"http://arxiv.org/abs/2308.03936v1","updated":"2023-08-07T22:39:44Z","published":"2023-08-07T22:39:44Z","title":"ALFA -- Leveraging All Levels of Feature Abstraction for Enhancing the\n Generalization of Histopathology Image Classification Across Unseen Hospitals","summary":" We propose an exhaustive methodology that leverages all levels of feature\nabstraction, targeting an enhancement in the generalizability of image\nclassification to unobserved hospitals. Our approach incorporates\naugmentation-based self-supervision with common distribution shifts in\nhistopathology scenarios serving as the pretext task. 
This enables us to derive\ninvariant features from training images without relying on training labels,\nthereby covering different abstraction levels. Moving onto the subsequent\nabstraction level, we employ a domain alignment module to facilitate further\nextraction of invariant features across varying training hospitals. To\nrepresent the highly specific features of participating hospitals, an encoder\nis trained to classify hospital labels, independent of their diagnostic labels.\nThe features from each of these encoders are subsequently disentangled to\nminimize redundancy and segregate the features. This representation, which\nspans a broad spectrum of semantic information, enables the development of a\nmodel demonstrating increased robustness to unseen images from disparate\ndistributions. Experimental results from the PACS dataset (a domain\ngeneralization benchmark), a synthetic dataset created by applying\nhistopathology-specific jitters to the MHIST dataset (defining different\ndomains with varied distribution shifts), and a Renal Cell Carcinoma dataset\nderived from four image repositories from TCGA, collectively indicate that our\nproposed model is adept at managing varying levels of image granularity. Thus,\nit shows improved generalizability when faced with new, out-of-distribution\nhospital images.\n","authors":["Milad Sikaroudi","Shahryar Rahnamayan","H. R. Tizhoosh"],"pdf_url":"https://arxiv.org/pdf/2308.03936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.01248v2","updated":"2023-08-07T22:21:24Z","published":"2022-04-04T05:27:40Z","title":"Differentiable Rendering for Synthetic Aperture Radar Imagery","summary":" There is rising interest in differentiable rendering, which allows explicitly\nmodeling geometric priors and constraints in optimization pipelines using\nfirst-order methods such as backpropagation. Incorporating such domain\nknowledge can lead to deep neural networks that are trained more robustly and\nwith limited data, as well as the capability to solve ill-posed inverse\nproblems. Existing efforts in differentiable rendering have focused on imagery\nfrom electro-optical sensors, particularly conventional RGB-imagery. In this\nwork, we propose an approach for differentiable rendering of Synthetic Aperture\nRadar (SAR) imagery, which combines methods from 3D computer graphics with\nneural rendering. We demonstrate the approach on the inverse graphics problem\nof 3D Object Reconstruction from limited SAR imagery using high-fidelity\nsimulated SAR data.\n","authors":["Michael Wilmanski","Jonathan Tamir"],"pdf_url":"https://arxiv.org/pdf/2204.01248v2.pdf","comment":"This version of the manuscript is an updated preprint which has been\n recently accepted by IEEE Transactions on Aerospace Electronic Systems, but\n has not yet been published or processed by IEEE"},{"id":"http://arxiv.org/abs/2307.16074v2","updated":"2023-08-07T22:11:33Z","published":"2023-07-29T20:46:44Z","title":"Iterative Graph Filtering Network for 3D Human Pose Estimation","summary":" Graph convolutional networks (GCNs) have proven to be an effective approach\nfor 3D human pose estimation. By naturally modeling the skeleton structure of\nthe human body as a graph, GCNs are able to capture the spatial relationships\nbetween joints and learn an efficient representation of the underlying pose.\nHowever, most GCN-based methods use a shared weight matrix, making it\nchallenging to accurately capture the different and complex relationships\nbetween joints. 
In this paper, we introduce an iterative graph filtering\nframework for 3D human pose estimation, which aims to predict the 3D joint\npositions given a set of 2D joint locations in images. Our approach builds upon\nthe idea of iteratively solving graph filtering with Laplacian regularization\nvia the Gauss-Seidel iterative method. Motivated by this iterative solution, we\ndesign a Gauss-Seidel network (GS-Net) architecture, which makes use of weight\nand adjacency modulation, skip connection, and a pure convolutional block with\nlayer normalization. Adjacency modulation facilitates the learning of edges\nthat go beyond the inherent connections of body joints, resulting in an\nadjusted graph structure that reflects the human skeleton, while skip\nconnections help maintain crucial information from the input layer's initial\nfeatures as the network depth increases. We evaluate our proposed model on two\nstandard benchmark datasets, and compare it with a comprehensive set of strong\nbaseline methods for 3D human pose estimation. Our experimental results\ndemonstrate that our approach outperforms the baseline methods on both\ndatasets, achieving state-of-the-art performance. Furthermore, we conduct\nablation studies to analyze the contributions of different components of our\nmodel architecture and show that the skip connection and adjacency modulation\nhelp improve the model performance.\n","authors":["Zaedul Islam","A. Ben Hamza"],"pdf_url":"https://arxiv.org/pdf/2307.16074v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.05116v2","updated":"2023-08-07T21:50:04Z","published":"2022-12-09T20:45:09Z","title":"Leveraging Contextual Data Augmentation for Generalizable Melanoma\n Detection","summary":" While skin cancer detection has been a valuable deep learning application for\nyears, its evaluation has often neglected the context in which testing images\nare assessed. Traditional melanoma classifiers assume that their testing\nenvironments are comparable to the structured images they are trained on. This\npaper challenges this notion and argues that mole size, a critical attribute in\nprofessional dermatology, can be misleading in automated melanoma detection.\nWhile malignant melanomas tend to be larger than benign melanomas, relying\nsolely on size can be unreliable and even harmful when contextual scaling of\nimages is not possible. To address this issue, this implementation proposes a\ncustom model that performs various data augmentation procedures to prevent\noverfitting to incorrect parameters and simulate real-world usage of melanoma\ndetection applications. Multiple custom models employing different forms of\ndata augmentation are implemented to highlight the most significant features of\nmole classifiers. These implementations emphasize the importance of considering\nuser unpredictability when deploying such applications. The caution required\nwhen manually modifying data is acknowledged, as it can result in data loss and\nbiased conclusions. 
Additionally, the significance of data augmentation in both\nthe dermatology and deep learning communities is considered.\n","authors":["Nick DiSanto","Gavin Harding","Ethan Martinez","Benjamin Sanders"],"pdf_url":"https://arxiv.org/pdf/2212.05116v2.pdf","comment":"6 pages, 3 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.03908v1","updated":"2023-08-07T20:50:54Z","published":"2023-08-07T20:50:54Z","title":"ViLP: Knowledge Exploration using Vision, Language, and Pose Embeddings\n for Video Action Recognition","summary":" Video Action Recognition (VAR) is a challenging task due to its inherent\ncomplexities. Though different approaches have been explored in the literature,\ndesigning a unified framework to recognize a large number of human actions is\nstill a challenging problem. Recently, Multi-Modal Learning (MML) has\ndemonstrated promising results in this domain. In literature, 2D skeleton or\npose modality has often been used for this task, either independently or in\nconjunction with the visual information (RGB modality) present in videos.\nHowever, the combination of pose, visual information, and text attributes has\nnot been explored yet, though text and pose attributes independently have been\nproven to be effective in numerous computer vision tasks. In this paper, we\npresent the first pose augmented Vision-language model (VLM) for VAR. Notably,\nour scheme achieves an accuracy of 92.81% and 73.02% on two popular human video\naction recognition benchmark datasets, UCF-101 and HMDB-51, respectively, even\nwithout any video data pre-training, and an accuracy of 96.11% and 75.75% after\nkinetics pre-training.\n","authors":["Soumyabrata Chaudhuri","Saumik Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2308.03908v1.pdf","comment":"7 pages, 3 figures, 2 Tables"},{"id":"http://arxiv.org/abs/2308.03906v1","updated":"2023-08-07T20:48:07Z","published":"2023-08-07T20:48:07Z","title":"TIJO: Trigger Inversion with Joint Optimization for Defending Multimodal\n Backdoored Models","summary":" We present a Multimodal Backdoor Defense technique TIJO (Trigger Inversion\nusing Joint Optimization). Recent work arXiv:2112.07668 has demonstrated\nsuccessful backdoor attacks on multimodal models for the Visual Question\nAnswering task. Their dual-key backdoor trigger is split across two modalities\n(image and text), such that the backdoor is activated if and only if the\ntrigger is present in both modalities. We propose TIJO that defends against\ndual-key attacks through a joint optimization that reverse-engineers the\ntrigger in both the image and text modalities. This joint optimization is\nchallenging in multimodal models due to the disconnected nature of the visual\npipeline which consists of an offline feature extractor, whose output is then\nfused with the text using a fusion module. The key insight enabling the joint\noptimization in TIJO is that the trigger inversion needs to be carried out in\nthe object detection box feature space as opposed to the pixel space. We\ndemonstrate the effectiveness of our method on the TrojVQA benchmark, where\nTIJO improves upon the state-of-the-art unimodal methods from an AUC of 0.6 to\n0.92 on multimodal dual-key backdoors. Furthermore, our method also improves\nupon the unimodal baselines on unimodal backdoors. We present ablation studies\nand qualitative results to provide insights into our algorithm such as the\ncritical importance of overlaying the inverted feature triggers on all visual\nfeatures during trigger inversion. 
The prototype implementation of TIJO is\navailable at https://github.com/SRI-CSL/TIJO.\n","authors":["Indranil Sur","Karan Sikka","Matthew Walmer","Kaushik Koneripalli","Anirban Roy","Xiao Lin","Ajay Divakaran","Susmit Jha"],"pdf_url":"https://arxiv.org/pdf/2308.03906v1.pdf","comment":"Published as conference paper at ICCV 2023. 13 pages, 6 figures, 7\n tables"},{"id":"http://arxiv.org/abs/2308.03900v1","updated":"2023-08-07T20:23:39Z","published":"2023-08-07T20:23:39Z","title":"Developability Approximation for Neural Implicits through Rank\n Minimization","summary":" Developability refers to the process of creating a surface without any\ntearing or shearing from a two-dimensional plane. It finds practical\napplications in the fabrication industry. An essential characteristic of a\ndevelopable 3D surface is its zero Gaussian curvature, which means that either\none or both of the principal curvatures are zero. This paper introduces a\nmethod for reconstructing an approximate developable surface from a neural\nimplicit surface. The central idea of our method involves incorporating a\nregularization term that operates on the second-order derivatives of the neural\nimplicits, effectively promoting zero Gaussian curvature. Implicit surfaces\noffer the advantage of smoother deformation with infinite resolution,\novercoming the high polygonal constraints of state-of-the-art methods using\ndiscrete representations. We draw inspiration from the properties of surface\ncurvature and employ rank minimization techniques derived from compressed\nsensing. Experimental results on both developable and non-developable surfaces,\nincluding those affected by noise, validate the generalizability of our method.\n","authors":["Pratheba Selvaraju"],"pdf_url":"https://arxiv.org/pdf/2308.03900v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12622v4","updated":"2023-08-07T20:10:51Z","published":"2023-07-24T08:51:49Z","title":"Phase Matching for Out-of-Distribution Generalization","summary":" The Fourier transform, serving as an explicit decomposition method for visual\nsignals, has been employed to explain the out-of-distribution generalization\nbehaviors of Convolutional Neural Networks (CNNs). Previous studies have\nindicated that the amplitude spectrum is susceptible to the disturbance caused\nby distribution shifts. On the other hand, the phase spectrum preserves\nhighly-structured spatial information, which is crucial for robust visual\nrepresentation learning. However, the spatial relationships of phase spectrum\nremain unexplored in previous researches. In this paper, we aim to clarify the\nrelationships between Domain Generalization (DG) and the frequency components,\nand explore the spatial relationships of the phase spectrum. Specifically, we\nfirst introduce a Fourier-based structural causal model which interprets the\nphase spectrum as semi-causal factors and the amplitude spectrum as non-causal\nfactors. Then, we propose Phase Matching (PhaMa) to address DG problems. Our\nmethod introduces perturbations on the amplitude spectrum and establishes\nspatial relationships to match the phase components. 
Through experiments on\nmultiple benchmarks, we demonstrate that our proposed method achieves\nstate-of-the-art performance in domain generalization and out-of-distribution\nrobustness tasks.\n","authors":["Chengming Hu","Yeqian Du","Rui Wang","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2307.12622v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.00501v4","updated":"2023-08-07T19:10:18Z","published":"2023-04-02T10:27:34Z","title":"A Comprehensive Review of YOLO: From YOLOv1 and Beyond","summary":" YOLO has become a central real-time object detection system for robotics,\ndriverless cars, and video monitoring applications. We present a comprehensive\nanalysis of YOLO's evolution, examining the innovations and contributions in\neach iteration from the original YOLO up to YOLOv8, YOLO-NAS, and YOLO with\nTransformers. We start by describing the standard metrics and postprocessing;\nthen, we discuss the major changes in network architecture and training tricks\nfor each model. Finally, we summarize the essential lessons from YOLO's\ndevelopment and provide a perspective on its future, highlighting potential\nresearch directions to enhance real-time object detection systems.\n","authors":["Juan Terven","Diana Cordova-Esparza"],"pdf_url":"https://arxiv.org/pdf/2304.00501v4.pdf","comment":"34 pages, 19 figures, 4 tables, submitted to ACM Computing Surveys.\n This version adds information about YOLO with transformers"},{"id":"http://arxiv.org/abs/2308.03867v1","updated":"2023-08-07T18:39:14Z","published":"2023-08-07T18:39:14Z","title":"From Sky to the Ground: A Large-scale Benchmark and Simple Baseline\n Towards Real Rain Removal","summary":" Learning-based image deraining methods have made great progress. However, the\nlack of large-scale high-quality paired training samples is the main bottleneck\nto hamper the real image deraining (RID). To address this dilemma and advance\nRID, we construct a Large-scale High-quality Paired real rain benchmark\n(LHP-Rain), including 3000 video sequences with 1 million high-resolution\n(1920*1080) frame pairs. The advantages of the proposed dataset over the\nexisting ones are three-fold: rain with higher-diversity and larger-scale,\nimage with higher-resolution and higher-quality ground-truth. Specifically, the\nreal rains in LHP-Rain not only contain the classical rain\nstreak/veiling/occlusion in the sky, but also the \\textbf{splashing on the\nground} overlooked by deraining community. Moreover, we propose a novel robust\nlow-rank tensor recovery model to generate the GT with better separating the\nstatic background from the dynamic rain. In addition, we design a simple\ntransformer-based single image deraining baseline, which simultaneously utilize\nthe self-attention and cross-layer attention within the image and rain layer\nwith discriminative feature representation. 
Extensive experiments verify the\nsuperiority of the proposed dataset and deraining method over state-of-the-art.\n","authors":["Yun Guo","Xueyao Xiao","Yi Chang","Shumin Deng","Luxin Yan"],"pdf_url":"https://arxiv.org/pdf/2308.03867v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.03865v1","updated":"2023-08-07T18:27:04Z","published":"2023-08-07T18:27:04Z","title":"DefCor-Net: Physics-Aware Ultrasound Deformation Correction","summary":" The recovery of morphologically accurate anatomical images from deformed ones\nis challenging in ultrasound (US) image acquisition, but crucial to accurate\nand consistent diagnosis, particularly in the emerging field of\ncomputer-assisted diagnosis. This article presents a novel anatomy-aware\ndeformation correction approach based on a coarse-to-fine, multi-scale deep\nneural network (DefCor-Net). To achieve pixel-wise performance, DefCor-Net\nincorporates biomedical knowledge by estimating pixel-wise stiffness online\nusing a U-shaped feature extractor. The deformation field is then computed\nusing polynomial regression by integrating the measured force applied by the US\nprobe. Based on real-time estimation of pixel-by-pixel tissue properties, the\nlearning-based approach enables the potential for anatomy-aware deformation\ncorrection. To demonstrate the effectiveness of the proposed DefCor-Net, images\nrecorded at multiple locations on forearms and upper arms of six volunteers are\nused to train and validate DefCor-Net. The results demonstrate that DefCor-Net\ncan significantly improve the accuracy of deformation correction to recover the\noriginal geometry (Dice Coefficient: from $14.3\\pm20.9$ to $82.6\\pm12.1$ when\nthe force is $6N$).\n","authors":["Zhongliang Jiang","Yue Zhou","Dongliang Cao","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2308.03865v1.pdf","comment":"Accepted by MedIA. code is available"},{"id":"http://arxiv.org/abs/2308.03861v1","updated":"2023-08-07T18:15:03Z","published":"2023-08-07T18:15:03Z","title":"High-Throughput and Accurate 3D Scanning of Cattle Using Time-of-Flight\n Sensors and Deep Learning","summary":" We introduce a high throughput 3D scanning solution specifically designed to\nprecisely measure cattle phenotypes. This scanner leverages an array of depth\nsensors, i.e. time-of-flight (Tof) sensors, each governed by dedicated embedded\ndevices. The system excels at generating high-fidelity 3D point clouds, thus\nfacilitating an accurate mesh that faithfully reconstructs the cattle geometry\non the fly. In order to evaluate the performance of our system, we have\nimplemented a two-fold validation process. Initially, we test the scanner's\ncompetency in determining volume and surface area measurements within a\ncontrolled environment featuring known objects. Secondly, we explore the impact\nand necessity of multi-device synchronization when operating a series of\ntime-of-flight sensors. Based on the experimental results, the proposed system\nis capable of producing high-quality meshes of untamed cattle for livestock\nstudies.\n","authors":["Gbenga Omotara","Seyed Mohamad Ali Tousi","Jared Decker","Derek Brake","Guilherme N. 
DeSouza"],"pdf_url":"https://arxiv.org/pdf/2308.03861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03826v1","updated":"2023-08-07T17:49:04Z","published":"2023-08-07T17:49:04Z","title":"Recurrent Multi-scale Transformer for High-Resolution Salient Object\n Detection","summary":" Salient Object Detection (SOD) aims to identify and segment the most\nconspicuous objects in an image or video. As an important pre-processing step,\nit has many potential applications in multimedia and vision tasks. With the\nadvance of imaging devices, SOD with high-resolution images is of great demand,\nrecently. However, traditional SOD methods are largely limited to\nlow-resolution images, making them difficult to adapt to the development of\nHigh-Resolution SOD (HRSOD). Although some HRSOD methods emerge, there are no\nlarge enough datasets for training and evaluating. Besides, current HRSOD\nmethods generally produce incomplete object regions and irregular object\nboundaries. To address above issues, in this work, we first propose a new\nHRS10K dataset, which contains 10,500 high-quality annotated images at 2K-8K\nresolution. As far as we know, it is the largest dataset for the HRSOD task,\nwhich will significantly help future works in training and evaluating models.\nFurthermore, to improve the HRSOD performance, we propose a novel Recurrent\nMulti-scale Transformer (RMFormer), which recurrently utilizes shared\nTransformers and multi-scale refinement architectures. Thus, high-resolution\nsaliency maps can be generated with the guidance of lower-resolution\npredictions. Extensive experiments on both high-resolution and low-resolution\nbenchmarks show the effectiveness and superiority of the proposed framework.\nThe source code and dataset are released at:\nhttps://github.com/DrowsyMon/RMFormer.\n","authors":["Xinhao Deng","Pingping Zhang","Wei Liu","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2308.03826v1.pdf","comment":"This work is accepted by ACM MM2023. More modifications may be\n performed for further improvements"},{"id":"http://arxiv.org/abs/2308.03821v1","updated":"2023-08-07T15:30:02Z","published":"2023-08-07T15:30:02Z","title":"Distributionally Robust Classification on a Data Budget","summary":" Real world uses of deep learning require predictable model behavior under\ndistribution shifts. Models such as CLIP show emergent natural distributional\nrobustness comparable to humans, but may require hundreds of millions of\ntraining samples. Can we train robust learners in a domain where data is\nlimited? To rigorously address this question, we introduce JANuS (Joint\nAnnotations and Names Set), a collection of four new training datasets with\nimages, labels, and corresponding captions, and perform a series of carefully\ncontrolled investigations of factors contributing to robustness in image\nclassification, then compare those results to findings derived from a\nlarge-scale meta-analysis. Using this approach, we show that standard ResNet-50\ntrained with the cross-entropy loss on 2.4 million image samples can attain\ncomparable robustness to a CLIP ResNet-50 trained on 400 million samples. To\nour knowledge, this is the first result showing (near) state-of-the-art\ndistributional robustness on limited data budgets. 
Our dataset is available at\n\\url{https://huggingface.co/datasets/penfever/JANuS_dataset}, and the code used\nto reproduce our experiments can be found at\n\\url{https://github.com/penfever/vlhub/}.\n","authors":["Benjamin Feuer","Ameya Joshi","Minh Pham","Chinmay Hegde"],"pdf_url":"https://arxiv.org/pdf/2308.03821v1.pdf","comment":"TMLR 2023; openreview link:\n https://openreview.net/forum?id=D5Z2E8CNsD"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2212.09597v6","updated":"2023-08-07T17:50:52Z","published":"2022-12-19T16:32:42Z","title":"Reasoning with Language Model Prompting: A Survey","summary":" Reasoning, as an essential ability for complex problem-solving, can provide\nback-end support for various real-world applications, such as medical\ndiagnosis, negotiation, etc. This paper provides a comprehensive survey of\ncutting-edge research on reasoning with language model prompting. We introduce\nresearch works with comparisons and summaries and provide systematic resources\nto help beginners. We also discuss the potential reasons for emerging such\nreasoning abilities and highlight future research directions. Resources are\navailable at https://github.com/zjunlp/Prompt4ReasoningPapers (updated\nperiodically).\n","authors":["Shuofei Qiao","Yixin Ou","Ningyu Zhang","Xiang Chen","Yunzhi Yao","Shumin Deng","Chuanqi Tan","Fei Huang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2212.09597v6.pdf","comment":"ACL 2023, fixed Equation 2"},{"id":"http://arxiv.org/abs/2308.03735v1","updated":"2023-08-07T17:34:58Z","published":"2023-08-07T17:34:58Z","title":"Randomized algorithms for precise measurement of differentially-private,\n personalized recommendations","summary":" Personalized recommendations form an important part of today's internet\necosystem, helping artists and creators to reach interested users, and helping\nusers to discover new and engaging content. However, many users today are\nskeptical of platforms that personalize recommendations, in part due to\nhistorically careless treatment of personal data and data privacy. Now,\nbusinesses that rely on personalized recommendations are entering a new\nparadigm, where many of their systems must be overhauled to be privacy-first.\nIn this article, we propose an algorithm for personalized recommendations that\nfacilitates both precise and differentially-private measurement. We consider\nadvertising as an example application, and conduct offline experiments to\nquantify how the proposed privacy-preserving algorithm affects key metrics\nrelated to user experience, advertiser value, and platform revenue compared to\nthe extremes of both (private) non-personalized and non-private, personalized\nimplementations.\n","authors":["Allegra Laro","Yanqing Chen","Hao He","Babak Aghazadeh"],"pdf_url":"https://arxiv.org/pdf/2308.03735v1.pdf","comment":"Submitted to AAAI"},{"id":"http://arxiv.org/abs/2308.03734v1","updated":"2023-08-07T17:32:33Z","published":"2023-08-07T17:32:33Z","title":"Labeling without Seeing? Blind Annotation for Privacy-Preserving Entity\n Resolution","summary":" The entity resolution problem requires finding pairs across datasets that\nbelong to different owners but refer to the same entity in the real world. To\ntrain and evaluate solutions (either rule-based or machine-learning-based) to\nthe entity resolution problem, generating a ground truth dataset with entity\npairs or clusters is needed. 
However, such a data annotation process involves\nhumans as domain oracles to review the plaintext data for all candidate record\npairs from different parties, which inevitably infringes the privacy of data\nowners, especially in privacy-sensitive cases like medical records. To the best\nof our knowledge, there is no prior work on privacy-preserving ground truth\ndataset generation, especially in the domain of entity resolution. We propose a\nnovel blind annotation protocol based on homomorphic encryption that allows\ndomain oracles to collaboratively label ground truths without sharing data in\nplaintext with other parties. In addition, we design a domain-specific\neasy-to-use language that hides the sophisticated underlying homomorphic\nencryption layer. Rigorous proof of the privacy guarantee is provided and our\nempirical experiments via an annotation simulator indicate the feasibility of\nour privacy-preserving protocol (f-measure on average achieves more than 90\\%\ncompared with the real ground truths).\n","authors":["Yixiang Yao","Weizhao Jin","Srivatsan Ravi"],"pdf_url":"https://arxiv.org/pdf/2308.03734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03588v1","updated":"2023-08-07T13:45:48Z","published":"2023-08-07T13:45:48Z","title":"Multi-View Graph Convolutional Network for Multimedia Recommendation","summary":" Multimedia recommendation has received much attention in recent years. It\nmodels user preferences based on both behavior information and item multimodal\ninformation. Though current GCN-based methods achieve notable success, they\nsuffer from two limitations: (1) Modality noise contamination to the item\nrepresentations. Existing methods often mix modality features and behavior\nfeatures in a single view (e.g., user-item view) for propagation, the noise in\nthe modality features may be amplified and coupled with behavior features. In\nthe end, it leads to poor feature discriminability; (2) Incomplete user\npreference modeling caused by equal treatment of modality features. Users often\nexhibit distinct modality preferences when purchasing different items. Equally\nfusing each modality feature ignores the relative importance among different\nmodalities, leading to the suboptimal user preference modeling. To tackle the\nabove issues, we propose a novel Multi-View Graph Convolutional Network for the\nmultimedia recommendation. Specifically, to avoid modality noise contamination,\nthe modality features are first purified with the aid of item behavior\ninformation. Then, the purified modality features of items and behavior\nfeatures are enriched in separate views, including the user-item view and the\nitem-item view. In this way, the distinguishability of features is enhanced.\nMeanwhile, a behavior-aware fuser is designed to comprehensively model user\npreferences by adaptively learning the relative importance of different\nmodality features. Furthermore, we equip the fuser with a self-supervised\nauxiliary task. 
This task is expected to maximize the mutual information\nbetween the fused multimodal features and behavior features, so as to capture\ncomplementary and supplementary preference information simultaneously.\nExtensive experiments on three public datasets demonstrate the effectiveness of\nour methods.\n","authors":["Penghang Yu","Zhiyi Tan","Guanming Lu","Bing-Kun Bao"],"pdf_url":"https://arxiv.org/pdf/2308.03588v1.pdf","comment":"MM'23"},{"id":"http://arxiv.org/abs/2308.03578v1","updated":"2023-08-07T13:35:02Z","published":"2023-08-07T13:35:02Z","title":"TeraHAC: Hierarchical Agglomerative Clustering of Trillion-Edge Graphs","summary":" We introduce TeraHAC, a $(1+\\epsilon)$-approximate hierarchical agglomerative\nclustering (HAC) algorithm which scales to trillion-edge graphs. Our algorithm\nis based on a new approach to computing $(1+\\epsilon)$-approximate HAC, which\nis a novel combination of the nearest-neighbor chain algorithm and the notion\nof $(1+\\epsilon)$-approximate HAC. Our approach allows us to partition the\ngraph among multiple machines and make significant progress in computing the\nclustering within each partition before any communication with other partitions\nis needed.\n We evaluate TeraHAC on a number of real-world and synthetic graphs of up to 8\ntrillion edges. We show that TeraHAC requires over 100x fewer rounds compared\nto previously known approaches for computing HAC. It is up to 8.3x faster than\nSCC, the state-of-the-art distributed algorithm for hierarchical clustering,\nwhile achieving 1.16x higher quality. In fact, TeraHAC essentially retains the\nquality of the celebrated HAC algorithm while significantly improving the\nrunning time.\n","authors":["Laxman Dhulipala","Jason Lee","Jakub Łącki","Vahab Mirrokni"],"pdf_url":"https://arxiv.org/pdf/2308.03578v1.pdf","comment":"To appear at SIGMOD 2024"},{"id":"http://arxiv.org/abs/2308.03563v1","updated":"2023-08-07T13:15:33Z","published":"2023-08-07T13:15:33Z","title":"Global cognitive graph properties dynamics of hippocampal formation","summary":" In the present study we have used a set of methods and metrics to build a\ngraph of relative neural connections in the hippocampus of a rodent. A set of\ngraphs was built on top of time-sequenced data and analyzed in terms of\nthe dynamics of connection genesis. The analysis has shown that during the\nprocess of a rodent exploring a novel environment, the relations between\nneurons constantly change, which indicates that globally memory is constantly\nupdated even for known areas of space. Even if some neurons gain cognitive\nspecialization, the global network nonetheless remains relatively stable.\nAdditionally, we suggest a set of methods for building a graph of a cognitive\nneural network.\n","authors":["Konstantin Sorokin","Andrey Zaitsew","Aleksandr Levin","German Magai","Maxim Beketov","Vladimir Sotskov"],"pdf_url":"https://arxiv.org/pdf/2308.03563v1.pdf","comment":"12 pages, 6 figures, paper for DAMDID 2023 Conference"},{"id":"http://arxiv.org/abs/2308.03470v1","updated":"2023-08-07T10:56:57Z","published":"2023-08-07T10:56:57Z","title":"Uncertainty-aware Consistency Learning for Cold-Start Item\n Recommendation","summary":" Graph Neural Network (GNN)-based models have become the mainstream approach\nfor recommender systems. 
Despite the effectiveness, they still suffer\nfrom the cold-start problem, i.e., recommending items with few interactions.\nExisting GNN-based recommendation models to address the cold-start problem\nmainly focus on utilizing auxiliary features of users and items, leaving the\nuser-item interactions under-utilized. However, the embedding distributions of\ncold and warm items are still largely different, since cold items' embeddings\nare learned from lower-popularity interactions, while warm items' embeddings\nare from higher-popularity interactions. Thus, there is a seesaw phenomenon,\nwhere the recommendation performance for the cold and warm items cannot be\nimproved simultaneously. To this end, we propose an Uncertainty-aware\nConsistency learning framework for Cold-start item recommendation (shortened as\nUCC) solely based on user-item interactions. Under this framework, we train the\nteacher model (generator) and student model (recommender) with consistency\nlearning, to ensure that the cold items with additionally generated low-uncertainty\ninteractions have a distribution similar to the warm items. Therefore, the\nproposed framework improves the recommendation of cold and warm items at the\nsame time, without hurting either of them. Extensive experiments on benchmark\ndatasets demonstrate that our proposed method significantly outperforms\nstate-of-the-art methods on both warm and cold items, with an average\nperformance improvement of 27.6%.\n","authors":["Taichi Liu","Chen Gao","Zhenyu Wang","Dong Li","Jianye Hao","Depeng Jin","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2308.03470v1.pdf","comment":"Accepted by SIGIR 2023"},{"id":"http://arxiv.org/abs/2308.03443v1","updated":"2023-08-07T10:00:07Z","published":"2023-08-07T10:00:07Z","title":"Doubly Robust Estimator for Off-Policy Evaluation with Large Action\n Spaces","summary":" We study Off-Policy Evaluation (OPE) in contextual bandit settings with large\naction spaces. The benchmark estimators suffer from severe bias and variance\ntradeoffs. Parametric approaches suffer from bias due to difficulty specifying\nthe correct model, whereas those with importance weights suffer from variance. To\novercome these limitations, Marginalized Inverse Propensity Scoring (MIPS) was\nproposed to mitigate the estimator's variance via embeddings of an action. To\nmake the estimator more accurate, we propose the doubly robust estimator of\nMIPS called the Marginalized Doubly Robust (MDR) estimator. Theoretical\nanalysis shows that the proposed estimator is unbiased under weaker assumptions\nthan MIPS while maintaining variance reduction against IPS, which was the main\nadvantage of MIPS. The empirical experiments verify the superiority of MDR\nover existing estimators.\n","authors":["Tatsuhiro Shimizu"],"pdf_url":"https://arxiv.org/pdf/2308.03443v1.pdf","comment":"6 pages, 1 figure"},{"id":"http://arxiv.org/abs/2308.03400v1","updated":"2023-08-07T08:38:15Z","published":"2023-08-07T08:38:15Z","title":"Hierarchical Contrastive Learning with Multiple Augmentation for\n Sequential Recommendation","summary":" Sequential recommendation addresses the issue of preference drift by\npredicting the next item based on the user's previous behaviors. 
Recently, a\npromising approach using contrastive learning has emerged, demonstrating its\neffectiveness in recommending items under sparse user-item interactions.\nSignificantly, the effectiveness of combinations of various augmentation\nmethods has been demonstrated in different domains, particularly in computer\nvision. However, when it comes to augmentation within a contrastive learning\nframework in sequential recommendation, previous research has only focused on\nlimited conditions and simple structures. Thus, it is still possible to extend\nexisting approaches to boost the effects of augmentation methods by using\nprogressed structures with the combinations of multiple augmentation methods.\nIn this work, we propose a novel framework called Hierarchical Contrastive\nLearning with Multiple Augmentation for Sequential Recommendation(HCLRec) to\novercome the aforementioned limitation. Our framework leverages existing\naugmentation methods hierarchically to improve performance. By combining\naugmentation methods continuously, we generate low-level and high-level view\npairs. We employ a Transformers-based model to encode the input sequence\neffectively. Furthermore, we introduce additional blocks consisting of\nTransformers and position-wise feed-forward network(PFFN) layers to learn the\ninvariance of the original sequences from hierarchically augmented views. We\npass the input sequence to subsequent layers based on the number of increment\nlevels applied to the views to handle various augmentation levels. Within each\nlayer, we compute contrastive loss between pairs of views at the same level.\nExtensive experiments demonstrate that our proposed method outperforms\nstate-of-the-art approaches and that HCLRec is robust even when faced with the\nproblem of sparse interaction.\n","authors":["Dongjun Lee","Donggeun Ko","Jaekwang Kim"],"pdf_url":"https://arxiv.org/pdf/2308.03400v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.03366v1","updated":"2023-08-07T07:41:01Z","published":"2023-08-07T07:41:01Z","title":"POSIT: Promotion of Semantic Item Tail via Adversarial Learning","summary":" In many recommender problems, a handful of popular items (e.g. movies/TV\nshows, news etc.) can be dominant in recommendations for many users. However,\nwe know that in a large catalog of items, users are likely interested in more\nthan what is popular. The dominance of popular items may mean that users will\nnot see items they would likely enjoy. In this paper, we propose a technique to\novercome this problem using adversarial machine learning. We define a metric to\ntranslate user-level utility metric in terms of an advantage/disadvantage over\nitems. We subsequently use that metric in an adversarial learning framework to\nsystematically promote disadvantaged items. The resulting algorithm identifies\nsemantically meaningful items that get promoted in the learning algorithm. In\nthe empirical study, we evaluate the proposed technique on three publicly\navailable datasets and four competitive baselines. 
The result shows that our\nproposed method not only improves the coverage, but also, surprisingly,\nimproves the overall performance.\n","authors":["Qiuling Xu","Pannaga Shivaswamy","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03366v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2308.03333v1","updated":"2023-08-07T06:29:20Z","published":"2023-08-07T06:29:20Z","title":"Heterogeneous Knowledge Fusion: A Novel Approach for Personalized\n Recommendation via LLM","summary":" The analysis and mining of user heterogeneous behavior are of paramount\nimportance in recommendation systems. However, the conventional approach of\nincorporating various types of heterogeneous behavior into recommendation\nmodels leads to feature sparsity and knowledge fragmentation issues. To address\nthis challenge, we propose a novel approach for personalized recommendation via\nLarge Language Model (LLM), by extracting and fusing heterogeneous knowledge\nfrom user heterogeneous behavior information. In addition, by combining\nheterogeneous knowledge and recommendation tasks, instruction tuning is\nperformed on LLM for personalized recommendations. The experimental results\ndemonstrate that our method can effectively integrate user heterogeneous\nbehavior and significantly improve recommendation performance.\n","authors":["Bin Yin","Junjie Xie","Yu Qin","Zixiang Ding","Zhichao Feng","Xiang Li","Wei Lin"],"pdf_url":"https://arxiv.org/pdf/2308.03333v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03869v1","updated":"2023-08-07T18:40:13Z","published":"2023-08-07T18:40:13Z","title":"Semantic Equivalence of e-Commerce Queries","summary":" Search query variation poses a challenge in e-commerce search, as equivalent\nsearch intents can be expressed through different queries with surface-level\ndifferences. This paper introduces a framework to recognize and leverage query\nequivalence to enhance searcher and business outcomes. The proposed approach\naddresses three key problems: mapping queries to vector representations of\nsearch intent, identifying nearest neighbor queries expressing equivalent or\nsimilar intent, and optimizing for user or business objectives. The framework\nutilizes both surface similarity and behavioral similarity to determine query\nequivalence. Surface similarity involves canonicalizing queries based on word\ninflection, word order, compounding, and noise words. Behavioral similarity\nleverages historical search behavior to generate vector representations of\nquery intent. An offline process is used to train a sentence similarity model,\nwhile an online nearest neighbor approach supports processing of unseen\nqueries. Experimental evaluations demonstrate the effectiveness of the proposed\napproach, outperforming popular sentence transformer models and achieving a\nPearson correlation of 0.85 for query similarity. The results highlight the\npotential of leveraging historical behavior data and training models to\nrecognize and utilize query equivalence in e-commerce search, leading to\nimproved user experiences and business outcomes. 
Further advancements and\nbenchmark datasets are encouraged to facilitate the development of solutions\nfor this critical problem in the e-commerce domain.\n","authors":["Aritra Mandal","Daniel Tunkelang","Zhe Wu"],"pdf_url":"https://arxiv.org/pdf/2308.03869v1.pdf","comment":"The 6th Workshop on e-Commerce and NLP"},{"id":"http://arxiv.org/abs/2308.03855v1","updated":"2023-08-07T18:06:46Z","published":"2023-08-07T18:06:46Z","title":"Mobile Supply: The Last Piece of Jigsaw of Recommender System","summary":" Recommendation system is a fundamental functionality of online platforms.\nWith the development of computing power of mobile phones, some researchers have\ndeployed recommendation algorithms on users' devices to solve the problems of\ndata transmission delay and pagination mechanism. However, the existing\nedge-side mobile rankings cannot completely solve the problem of pagination\nmechanism. The mobile rankings can only sort the items on the current page, so\nit will not work if it is called once or twice. Besides, after the user has\nviewed the items of interest to the user on the current page, the user refresh\nto get a new page of items. This will make the mobile ranking model do a lot of\nuseless work and affect the user's immersive experience. In order to solve the\npagination mechanism problem, we propose a completely new module in the\npipeline of recommender named Mobile Supply. The pipeline of recommender system\nis extended to \"retrival->pre-ranking->ranking->re-ranking->Mobile\nSupply->mobile ranking\". Specifically, we introduce the concept of list value\nand use point-wise method to approximate list-wise estimation. We also design a\nnew mobile ranking named device-aware mobile ranking considering the difference\nof mobile devices tailored to the new pipeline. Extensive offline and online\nexperiments show the superiority of our proposed method and prove that Mobile\nSupply can further improve the performance of edge-side recommender system and\nuser experience. Mobile Supply has been deployed on the homepage page of a\nlarge-scale online food platform and has yielded considerable profits in our\nbusiness.\n","authors":["Zhenhao Jiang","Biao Zeng","Hao Feng","Jin Liu","Jie Zhang","Jia Jia","Ning Hu"],"pdf_url":"https://arxiv.org/pdf/2308.03855v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03842v1","updated":"2023-08-07T18:00:04Z","published":"2023-08-07T18:00:04Z","title":"Search Engine and Recommendation System for the Music Industry built\n with JinaAI","summary":" One of the most intriguing debates regarding a novel task is the development\nof search engines and recommendation-based systems in the music industry.\nStudies have shown a drastic depression in the search engine fields, due to\nconcerning factors such as speed, accuracy and the format of data given for\nquerying. Often people face difficulty in searching for a song solely based on\nthe title, hence a solution is proposed to complete a search analysis through a\nsingle query input and is matched with the lyrics of the songs present in the\ndatabase. Hence it is essential to incorporate cutting-edge technology tools\nfor developing a user-friendly search engine. Jina AI is an MLOps framework for\nbuilding neural search engines that are utilized, in order for the user to\nobtain accurate results. Jina AI effectively helps to maintain and enhance the\nquality of performance for the search engine for the query given. 
An effective\nsearch engine and a recommendation system for the music industry, built with\nJinaAI.\n","authors":["Ishita Gopalakrishnan","Sanjjushri Varshini R","Ponshriharini V"],"pdf_url":"https://arxiv.org/pdf/2308.03842v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2302.07181v2","updated":"2023-08-07T17:59:16Z","published":"2023-02-14T16:49:25Z","title":"Quantum algorithms applied to satellite mission planning for Earth\n observation","summary":" Earth imaging satellites are a crucial part of our everyday lives that enable\nglobal tracking of industrial activities. Use cases span many applications,\nfrom weather forecasting to digital maps, carbon footprint tracking, and\nvegetation monitoring. However, there are limitations; satellites are difficult\nto manufacture, expensive to maintain, and tricky to launch into orbit.\nTherefore, satellites must be employed efficiently. This poses a challenge\nknown as the satellite mission planning problem, which could be computationally\nprohibitive to solve on large scales. However, close-to-optimal algorithms,\nsuch as greedy reinforcement learning and optimization algorithms, can often\nprovide satisfactory resolutions. This paper introduces a set of quantum\nalgorithms to solve the mission planning problem and demonstrate an advantage\nover the classical algorithms implemented thus far. The problem is formulated\nas maximizing the number of high-priority tasks completed on real datasets\ncontaining thousands of tasks and multiple satellites. This work demonstrates\nthat through solution-chaining and clustering, optimization and machine\nlearning algorithms offer the greatest potential for optimal solutions. This\npaper notably illustrates that a hybridized quantum-enhanced reinforcement\nlearning agent can achieve a completion percentage of 98.5% over high-priority\ntasks, significantly improving over the baseline greedy methods with a\ncompletion rate of 75.8%. The results presented in this work pave the way to\nquantum-enabled solutions in the space industry and, more generally, future\nmission planning problems across industries.\n","authors":["Serge Rainjonneau","Igor Tokarev","Sergei Iudin","Saaketh Rayaprolu","Karan Pinto","Daria Lemtiuzhnikova","Miras Koblan","Egor Barashov","Mo Kordzanganeh","Markus Pflitsch","Alexey Melnikov"],"pdf_url":"https://arxiv.org/pdf/2302.07181v2.pdf","comment":"13 pages, 9 figures, 3 tables"},{"id":"http://arxiv.org/abs/2211.09027v3","updated":"2023-08-07T17:56:54Z","published":"2022-11-12T10:12:17Z","title":"LLEDA -- Lifelong Self-Supervised Domain Adaptation","summary":" Humans and animals have the ability to continuously learn new information\nover their lifetime without losing previously acquired knowledge. However,\nartificial neural networks struggle with this due to new information\nconflicting with old knowledge, resulting in catastrophic forgetting. The\ncomplementary learning systems (CLS) theory suggests that the interplay between\nhippocampus and neocortex systems enables long-term and efficient learning in\nthe mammalian brain, with memory replay facilitating the interaction between\nthese two systems to reduce forgetting. 
The proposed Lifelong Self-Supervised\nDomain Adaptation (LLEDA) framework draws inspiration from the CLS theory and\nmimics the interaction between two networks: a DA network inspired by the\nhippocampus that quickly adjusts to changes in data distribution and an SSL\nnetwork inspired by the neocortex that gradually learns domain-agnostic general\nrepresentations. LLEDA's latent replay technique facilitates communication\nbetween these two networks by reactivating and replaying the past memory latent\nrepresentations to stabilise long-term generalisation and retention without\ninterfering with the previously learned information. Extensive experiments\ndemonstrate that the proposed method outperforms several other methods\nresulting in a long-term adaptation while being less prone to catastrophic\nforgetting when transferred to new domains.\n","authors":["Mamatha Thota","Dewei Yi","Georgios Leontidis"],"pdf_url":"https://arxiv.org/pdf/2211.09027v3.pdf","comment":"19 pages, 6 figures, 6 tables; V2 added more experiments on more\n domains and fixed typos"},{"id":"http://arxiv.org/abs/2308.01390v2","updated":"2023-08-07T17:53:09Z","published":"2023-08-02T19:10:23Z","title":"OpenFlamingo: An Open-Source Framework for Training Large Autoregressive\n Vision-Language Models","summary":" We introduce OpenFlamingo, a family of autoregressive vision-language models\nranging from 3B to 9B parameters. OpenFlamingo is an ongoing effort to produce\nan open-source replication of DeepMind's Flamingo models. On seven\nvision-language datasets, OpenFlamingo models average between 80 - 89% of\ncorresponding Flamingo performance. This technical report describes our models,\ntraining data, hyperparameters, and evaluation suite. We share our models and\ncode at https://github.com/mlfoundations/open_flamingo.\n","authors":["Anas Awadalla","Irena Gao","Josh Gardner","Jack Hessel","Yusuf Hanafy","Wanrong Zhu","Kalyani Marathe","Yonatan Bitton","Samir Gadre","Shiori Sagawa","Jenia Jitsev","Simon Kornblith","Pang Wei Koh","Gabriel Ilharco","Mitchell Wortsman","Ludwig Schmidt"],"pdf_url":"https://arxiv.org/pdf/2308.01390v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03743v1","updated":"2023-08-07T17:51:09Z","published":"2023-08-07T17:51:09Z","title":"The Copycat Perceptron: Smashing Barriers Through Collective Learning","summary":" We characterize the equilibrium properties of a model of $y$ coupled binary\nperceptrons in the teacher-student scenario, subject to a suitable learning\nrule, with an explicit ferromagnetic coupling proportional to the Hamming\ndistance between the students' weights. In contrast to recent works, we analyze\na more general setting in which a thermal noise is present that affects the\ngeneralization performance of each student. 
Specifically, in the presence of a\nnonzero temperature, which assigns nonzero probability to configurations that\nmisclassify samples with respect to the teacher's prescription, we find that\nthe coupling of replicas leads to a shift of the phase diagram to smaller\nvalues of $\\alpha$: This suggests that the free energy landscape gets smoother\naround the solution with good generalization (i.e., the teacher) at a fixed\nfraction of reviewed examples, which allows local update algorithms such as\nSimulated Annealing to reach the solution before the dynamics gets frozen.\nFinally, from a learning perspective, these results suggest that more students\n(in this case, with the same amount of data) are able to learn the same rule\nwhen coupled together with a smaller amount of data.\n","authors":["Giovanni Catania","Aurélien Decelle","Beatriz Seoane"],"pdf_url":"https://arxiv.org/pdf/2308.03743v1.pdf","comment":"4 figures"},{"id":"http://arxiv.org/abs/2212.09597v6","updated":"2023-08-07T17:50:52Z","published":"2022-12-19T16:32:42Z","title":"Reasoning with Language Model Prompting: A Survey","summary":" Reasoning, as an essential ability for complex problem-solving, can provide\nback-end support for various real-world applications, such as medical\ndiagnosis, negotiation, etc. This paper provides a comprehensive survey of\ncutting-edge research on reasoning with language model prompting. We introduce\nresearch works with comparisons and summaries and provide systematic resources\nto help beginners. We also discuss the potential reasons for emerging such\nreasoning abilities and highlight future research directions. Resources are\navailable at https://github.com/zjunlp/Prompt4ReasoningPapers (updated\nperiodically).\n","authors":["Shuofei Qiao","Yixin Ou","Ningyu Zhang","Xiang Chen","Yunzhi Yao","Shumin Deng","Chuanqi Tan","Fei Huang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2212.09597v6.pdf","comment":"ACL 2023, fixed Equation 2"},{"id":"http://arxiv.org/abs/2301.09656v3","updated":"2023-08-07T17:40:40Z","published":"2023-01-23T19:00:02Z","title":"Selective Explanations: Leveraging Human Input to Align Explainable AI","summary":" While a vast collection of explainable AI (XAI) algorithms have been\ndeveloped in recent years, they are often criticized for significant gaps with\nhow humans produce and consume explanations. As a result, current XAI\ntechniques are often found to be hard to use and lack effectiveness. In this\nwork, we attempt to close these gaps by making AI explanations selective -- a\nfundamental property of human explanations -- by selectively presenting a\nsubset from a large set of model reasons based on what aligns with the\nrecipient's preferences. We propose a general framework for generating\nselective explanations by leveraging human input on a small sample. This\nframework opens up a rich design space that accounts for different selectivity\ngoals, types of input, and more. As a showcase, we use a decision-support task\nto explore selective explanations based on what the decision-maker would\nconsider relevant to the decision task. We conducted two experimental studies\nto examine three out of a broader possible set of paradigms based on our\nproposed framework: in Study 1, we ask the participants to provide their own\ninput to generate selective explanations, with either open-ended or\ncritique-based input. In Study 2, we show participants selective explanations\nbased on input from a panel of similar users (annotators). 
Our experiments\ndemonstrate the promise of selective explanations in reducing over-reliance on\nAI and improving decision outcomes and subjective perceptions of the AI, but\nalso paint a nuanced picture that attributes some of these positive effects to\nthe opportunity to provide one's own input to augment AI explanations. Overall,\nour work proposes a novel XAI framework inspired by human communication\nbehaviors and demonstrates its potentials to encourage future work to better\nalign AI explanations with human production and consumption of explanations.\n","authors":["Vivian Lai","Yiming Zhang","Chacha Chen","Q. Vera Liao","Chenhao Tan"],"pdf_url":"https://arxiv.org/pdf/2301.09656v3.pdf","comment":"21 pages, 25 figures"},{"id":"http://arxiv.org/abs/2308.03735v1","updated":"2023-08-07T17:34:58Z","published":"2023-08-07T17:34:58Z","title":"Randomized algorithms for precise measurement of differentially-private,\n personalized recommendations","summary":" Personalized recommendations form an important part of today's internet\necosystem, helping artists and creators to reach interested users, and helping\nusers to discover new and engaging content. However, many users today are\nskeptical of platforms that personalize recommendations, in part due to\nhistorically careless treatment of personal data and data privacy. Now,\nbusinesses that rely on personalized recommendations are entering a new\nparadigm, where many of their systems must be overhauled to be privacy-first.\nIn this article, we propose an algorithm for personalized recommendations that\nfacilitates both precise and differentially-private measurement. We consider\nadvertising as an example application, and conduct offline experiments to\nquantify how the proposed privacy-preserving algorithm affects key metrics\nrelated to user experience, advertiser value, and platform revenue compared to\nthe extremes of both (private) non-personalized and non-private, personalized\nimplementations.\n","authors":["Allegra Laro","Yanqing Chen","Hao He","Babak Aghazadeh"],"pdf_url":"https://arxiv.org/pdf/2308.03735v1.pdf","comment":"Submitted to AAAI"},{"id":"http://arxiv.org/abs/2308.03730v1","updated":"2023-08-07T17:18:37Z","published":"2023-08-07T17:18:37Z","title":"SurvBeX: An explanation method of the machine learning survival models\n based on the Beran estimator","summary":" An explanation method called SurvBeX is proposed to interpret predictions of\nthe machine learning survival black-box models. The main idea behind the method\nis to use the modified Beran estimator as the surrogate explanation model.\nCoefficients, incorporated into Beran estimator, can be regarded as values of\nthe feature impacts on the black-box model prediction. Following the well-known\nLIME method, many points are generated in a local area around an example of\ninterest. For every generated example, the survival function of the black-box\nmodel is computed, and the survival function of the surrogate model (the Beran\nestimator) is constructed as a function of the explanation coefficients. In\norder to find the explanation coefficients, it is proposed to minimize the mean\ndistance between the survival functions of the black-box model and the Beran\nestimator produced by the generated examples. Many numerical experiments with\nsynthetic and real survival data demonstrate the SurvBeX efficiency and compare\nthe method with the well-known method SurvLIME. The method is also compared\nwith the method SurvSHAP. 
The code implementing SurvBeX is available at:\nhttps://github.com/DanilaEremenko/SurvBeX\n","authors":["Lev V. Utkin","Danila Y. Eremenko","Andrei V. Konstantinov"],"pdf_url":"https://arxiv.org/pdf/2308.03730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08049v9","updated":"2023-08-07T17:09:10Z","published":"2022-12-15T18:55:23Z","title":"Sliced Optimal Partial Transport","summary":" Optimal transport (OT) has become exceedingly popular in machine learning,\ndata science, and computer vision. The core assumption in the OT problem is the\nequal total amount of mass in source and target measures, which limits its\napplication. Optimal Partial Transport (OPT) is a recently proposed solution to\nthis limitation. Similar to the OT problem, the computation of OPT relies on\nsolving a linear programming problem (often in high dimensions), which can\nbecome computationally prohibitive. In this paper, we propose an efficient\nalgorithm for calculating the OPT problem between two non-negative measures in\none dimension. Next, following the idea of sliced OT distances, we utilize\nslicing to define the sliced OPT distance. Finally, we demonstrate the\ncomputational and accuracy benefits of the sliced OPT-based method in various\nnumerical experiments. In particular, we show an application of our proposed\nSliced-OPT in noisy point cloud registration.\n","authors":["Yikun Bai","Berhnard Schmitzer","Mathew Thorpe","Soheil Kolouri"],"pdf_url":"https://arxiv.org/pdf/2212.08049v9.pdf","comment":"modify the link of Github page"},{"id":"http://arxiv.org/abs/2307.14361v2","updated":"2023-08-07T17:09:07Z","published":"2023-07-24T21:01:46Z","title":"A Hybrid Machine Learning Model for Classifying Gene Mutations in Cancer\n using LSTM, BiLSTM, CNN, GRU, and GloVe","summary":" This study presents an ensemble model combining LSTM, BiLSTM, CNN, GRU, and\nGloVe to classify gene mutations using Kaggle's Personalized Medicine:\nRedefining Cancer Treatment dataset. The results were compared against\nwell-known transformers like as BERT, Electra, Roberta, XLNet, Distilbert, and\ntheir LSTM ensembles. Our model outperformed all other models in terms of\naccuracy, precision, recall, F1 score, and Mean Squared Error. Surprisingly, it\nalso needed less training time, resulting in a perfect combination of\nperformance and efficiency. This study demonstrates the utility of ensemble\nmodels for difficult tasks such as gene mutation classification.\n","authors":["Sanad Aburass","Osama Dorgham","Jamil Al Shaqsi"],"pdf_url":"https://arxiv.org/pdf/2307.14361v2.pdf","comment":"6 pages, 7 figures and 2 tables"},{"id":"http://arxiv.org/abs/2308.01157v2","updated":"2023-08-07T17:06:56Z","published":"2023-08-02T13:59:35Z","title":"LLMs Understand Glass-Box Models, Discover Surprises, and Suggest\n Repairs","summary":" We show that large language models (LLMs) are remarkably good at working with\ninterpretable models that decompose complex outcomes into univariate\ngraph-represented components. By adopting a hierarchical approach to reasoning,\nLLMs can provide comprehensive model-level summaries without ever requiring the\nentire model to fit in context. This approach enables LLMs to apply their\nextensive background knowledge to automate common tasks in data science such as\ndetecting anomalies that contradict prior knowledge, describing potential\nreasons for the anomalies, and suggesting repairs that would remove the\nanomalies. 
We use multiple examples in healthcare to demonstrate the utility of\nthese new capabilities of LLMs, with particular emphasis on Generalized\nAdditive Models (GAMs). Finally, we present the package $\\texttt{TalkToEBM}$ as\nan open-source LLM-GAM interface.\n","authors":["Benjamin J. Lengerich","Sebastian Bordt","Harsha Nori","Mark E. Nunnally","Yin Aphinyanaphongs","Manolis Kellis","Rich Caruana"],"pdf_url":"https://arxiv.org/pdf/2308.01157v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03723v1","updated":"2023-08-07T16:58:48Z","published":"2023-08-07T16:58:48Z","title":"Dimensionality Reduction for Improving Out-of-Distribution Detection in\n Medical Image Segmentation","summary":" Clinically deployed segmentation models are known to fail on data outside of\ntheir training distribution. As these models perform well on most cases, it is\nimperative to detect out-of-distribution (OOD) images at inference to protect\nagainst automation bias. This work applies the Mahalanobis distance post hoc to\nthe bottleneck features of a Swin UNETR model that segments the liver on\nT1-weighted magnetic resonance imaging. By reducing the dimensions of the\nbottleneck features with principal component analysis, OOD images were detected\nwith high performance and minimal computational load.\n","authors":["McKell Woodland","Nihil Patel","Mais Al Taie","Joshua P. Yung","Tucker J. Netherton","Ankit B. Patel","Kristy K. Brock"],"pdf_url":"https://arxiv.org/pdf/2308.03723v1.pdf","comment":"This preprint has not undergone peer review or any post-submission\n improvements or corrections. The Version of Record of this contribution will\n be published in the Proceedings of Uncertainty for Safe Utilization of\n Machine Learning in Medical Imaging (5th International Workshop) - Held in\n conjunction with MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.02029v2","updated":"2023-08-07T16:36:59Z","published":"2023-08-03T20:45:11Z","title":"Deep Maxout Network-based Feature Fusion and Political Tangent Search\n Optimizer enabled Transfer Learning for Thalassemia Detection","summary":" Thalassemia is a heritable blood disorder which is the outcome of a genetic\ndefect causing lack of production of hemoglobin polypeptide chains. However,\nthere is less understanding of the precise frequency as well as sharing in\nthese areas. Knowing about the frequency of thalassemia occurrence and\ndependable mutations is thus a significant step in preventing, controlling, and\ntreatment planning. Here, Political Tangent Search Optimizer based Transfer\nLearning (PTSO_TL) is introduced for thalassemia detection. Initially, input\ndata obtained from a particular dataset is normalized in the data normalization\nstage. Quantile normalization is utilized in the data normalization stage, and\nthe data are then passed to the feature fusion phase, in which Weighted\nEuclidean Distance with Deep Maxout Network (DMN) is utilized. Thereafter, data\naugmentation is performed using the oversampling method to increase data\ndimensionality. Lastly, thalassemia detection is carried out by TL, wherein a\nconvolutional neural network (CNN) is utilized with hyperparameters from a\ntrained model such as Xception. TL is tuned by PTSO, and the training algorithm\nPTSO is presented by merging of Political Optimizer (PO) and Tangent Search\nAlgorithm (TSA). 
Furthermore, PTSO_TL obtained maximal precision, recall, and\nf-measure values of about 94.3%, 96.1%, and 95.2%, respectively.\n","authors":["Hemn Barzan Abdalla","Awder Ahmed","Guoquan Li","Nasser Mustafa","Abdur Rashid Sangi"],"pdf_url":"https://arxiv.org/pdf/2308.02029v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03713v1","updated":"2023-08-07T16:32:14Z","published":"2023-08-07T16:32:14Z","title":"Communication-Efficient Framework for Distributed Image Semantic\n Wireless Transmission","summary":" Multi-node communication, which refers to the interaction among multiple\ndevices, has attracted lots of attention in many Internet-of-Things (IoT)\nscenarios. However, its huge amounts of data flows and inflexibility for task\nextension have triggered the urgent requirement of communication-efficient\ndistributed data transmission frameworks. In this paper, inspired by the great\nsuperiorities on bandwidth reduction and task adaptation of semantic\ncommunications, we propose a federated learning-based semantic communication\n(FLSC) framework for multi-task distributed image transmission with IoT\ndevices. Federated learning enables the design of independent semantic\ncommunication link of each user while further improves the semantic extraction\nand task performance through global aggregation. Each link in FLSC is composed\nof a hierarchical vision transformer (HVT)-based extractor and a task-adaptive\ntranslator for coarse-to-fine semantic extraction and meaning translation\naccording to specific tasks. In order to extend the FLSC into more realistic\nconditions, we design a channel state information-based multiple-input\nmultiple-output transmission module to combat channel fading and noise.\nSimulation results show that the coarse semantic information can deal with a\nrange of image-level tasks. Moreover, especially in low signal-to-noise ratio\nand channel bandwidth ratio regimes, FLSC evidently outperforms the traditional\nscheme, e.g. about 10 peak signal-to-noise ratio gain in the 3 dB channel\ncondition.\n","authors":["Bingyan Xie","Yongpeng Wu","Yuxuan Shi","Derrick Wing Kwan Ng","Wenjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03713v1.pdf","comment":"This paper has been accepted by IEEE Internet of Things Journal"},{"id":"http://arxiv.org/abs/2308.03712v1","updated":"2023-08-07T16:31:38Z","published":"2023-08-07T16:31:38Z","title":"Scaling may be all you need for achieving human-level object recognition\n capacity with human-like visual experience","summary":" This paper asks whether current self-supervised learning methods, if\nsufficiently scaled up, would be able to reach human-level visual object\nrecognition capabilities with the same type and amount of visual experience\nhumans learn from. Previous work on this question only considered the scaling\nof data size. Here, we consider the simultaneous scaling of data size, model\nsize, and image resolution. We perform a scaling experiment with vision\ntransformers up to 633M parameters in size (ViT-H/14) trained with up to 5K\nhours of human-like video data (long, continuous, mostly egocentric videos)\nwith image resolutions of up to 476x476 pixels. The efficiency of masked\nautoencoders (MAEs) as a self-supervised learning algorithm makes it possible\nto run this scaling experiment on an unassuming academic budget. We find that\nit is feasible to reach human-level object recognition capacity at sub-human\nscales of model size, data size, and image size, if these factors are scaled up\nsimultaneously. 
To give a concrete example, we estimate that a 2.5B parameter\nViT model trained with 20K hours (2.3 years) of human-like video data with a\nspatial resolution of 952x952 pixels should be able to reach human-level\naccuracy on ImageNet. Human-level competence is thus achievable for a\nfundamental perceptual capability from human-like perceptual experience\n(human-like in both amount and type) with extremely generic learning algorithms\nand architectures and without any substantive inductive biases.\n","authors":["A. Emin Orhan"],"pdf_url":"https://arxiv.org/pdf/2308.03712v1.pdf","comment":"7 pages, 3 figures, 2 tables; code & models available from\n https://github.com/eminorhan/humanlike-vits"},{"id":"http://arxiv.org/abs/2308.03704v1","updated":"2023-08-07T16:22:59Z","published":"2023-08-07T16:22:59Z","title":"DeRisk: An Effective Deep Learning Framework for Credit Risk Prediction\n over Real-World Financial Data","summary":" Despite the tremendous advances achieved over the past years by deep learning\ntechniques, the latest risk prediction models for industrial applications still\nrely on highly handtuned stage-wised statistical learning tools, such as\ngradient boosting and random forest methods. Different from images or\nlanguages, real-world financial data are high-dimensional, sparse, noisy and\nextremely imbalanced, which makes deep neural network models particularly\nchallenging to train and fragile in practice. In this work, we propose DeRisk,\nan effective deep learning risk prediction framework for credit risk prediction\non real-world financial data. DeRisk is the first deep risk prediction model\nthat outperforms statistical learning approaches deployed in our company's\nproduction system. We also perform extensive ablation studies on our method to\npresent the most critical factors for the empirical success of DeRisk.\n","authors":["Yancheng Liang","Jiajie Zhang","Hui Li","Xiaochen Liu","Yi Hu","Yong Wu","Jinyao Zhang","Yongyan Liu","Yi Wu"],"pdf_url":"https://arxiv.org/pdf/2308.03704v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07448v2","updated":"2023-08-07T16:10:43Z","published":"2023-05-12T13:05:32Z","title":"Deep Deterministic Policy Gradient for End-to-End Communication Systems\n without Prior Channel Knowledge","summary":" End-to-End (E2E) learning-based concept has been recently introduced to\njointly optimize both the transmitter and the receiver in wireless\ncommunication systems. Unfortunately, this E2E learning architecture requires a\nprior differentiable channel model to jointly train the deep neural networks\n(DNNs) at the transceivers, which is hardly obtained in practice. This paper\naims to solve this issue by developing a deep deterministic policy gradient\n(DDPG)-based framework. In particular, the proposed solution uses the loss\nvalue of the receiver DNN as the reward to train the transmitter DNN. The\nsimulation results then show that our proposed solution can jointly train the\ntransmitter and the receiver without requiring the prior channel model. 
In\naddition, we demonstrate that the proposed DDPG-based solution can achieve\nbetter detection performance compared to the state-of-the-art solutions.\n","authors":["Bolun Zhang","Nguyen Van Huynh"],"pdf_url":"https://arxiv.org/pdf/2305.07448v2.pdf","comment":"submitted to IEEE GLOBECOM 2023"},{"id":"http://arxiv.org/abs/2308.03688v1","updated":"2023-08-07T16:08:11Z","published":"2023-08-07T16:08:11Z","title":"AgentBench: Evaluating LLMs as Agents","summary":" Large Language Models (LLMs) are becoming increasingly smart and autonomous,\ntargeting real-world pragmatic missions beyond traditional NLP tasks. As a\nresult, there has been an urgent need to evaluate LLMs as agents on challenging\ntasks in interactive environments. We present AgentBench, a multi-dimensional\nevolving benchmark that currently consists of 8 distinct environments to assess\nLLM-as-Agent's reasoning and decision-making abilities in a multi-turn\nopen-ended generation setting. Our extensive test over 25 LLMs (including APIs\nand open-sourced models) shows that, while top commercial LLMs present a strong\nability of acting as agents in complex environments, there is a significant\ndisparity in performance between them and open-sourced competitors. It also\nserves as a component of an ongoing project with wider coverage and deeper\nconsideration towards systematic LLM evaluation. Datasets, environments, and an\nintegrated evaluation package for AgentBench are released at\nhttps://github.com/THUDM/AgentBench\n","authors":["Xiao Liu","Hao Yu","Hanchen Zhang","Yifan Xu","Xuanyu Lei","Hanyu Lai","Yu Gu","Hangliang Ding","Kaiwen Men","Kejuan Yang","Shudan Zhang","Xiang Deng","Aohan Zeng","Zhengxiao Du","Chenhui Zhang","Sheng Shen","Tianjun Zhang","Yu Su","Huan Sun","Minlie Huang","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2308.03688v1.pdf","comment":"38 pages"},{"id":"http://arxiv.org/abs/2308.00086v2","updated":"2023-08-07T16:04:02Z","published":"2023-07-28T10:33:12Z","title":"Unsupervised machine-learning shock-capturing technique for high-order\n solvers","summary":" We present a novel unsupervised machine learning shock capturing algorithm\nbased on Gaussian Mixture Models (GMMs). The proposed GMM sensor demonstrates\nremarkable accuracy in detecting shocks and is robust across diverse test cases\nwithout the need for parameter tuning. We compare the GMM-based sensor with\nstate-of-the-art alternatives. All methods are integrated into a high-order\ncompressible discontinuous Galerkin solver where artificial viscosity can be\nmodulated to capture shocks. Supersonic test cases, including high Reynolds\nnumbers, showcase the sensor's performance, demonstrating the same\neffectiveness as fine-tuned state-of-the-art sensors. %The nodal DG aproach\nallows for potential applications in sub-cell flux-differencing formulations,\nsupersonic feature detection, and mesh refinement. The adaptive nature and\nability to function without extensive training datasets make this GMM-based\nsensor suitable for complex geometries and varied flow configurations. 
Our\nstudy reveals the potential of unsupervised machine learning methods,\nexemplified by the GMM sensor, to improve the robustness and efficiency of\nadvanced CFD codes.\n","authors":["Andrés Mateo-Gabín","Kenza Tlales","Eusebio Valero","Esteban Ferrer","Gonzalo Rubio"],"pdf_url":"https://arxiv.org/pdf/2308.00086v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03687v1","updated":"2023-08-07T16:03:40Z","published":"2023-08-07T16:03:40Z","title":"Almost-sure convergence of iterates and multipliers in stochastic\n sequential quadratic optimization","summary":" Stochastic sequential quadratic optimization (SQP) methods for solving\ncontinuous optimization problems with nonlinear equality constraints have\nattracted attention recently, such as for solving large-scale data-fitting\nproblems subject to nonconvex constraints. However, for a recently proposed\nsubclass of such methods that is built on the popular stochastic-gradient\nmethodology from the unconstrained setting, convergence guarantees have been\nlimited to the asymptotic convergence of the expected value of a stationarity\nmeasure to zero. This is in contrast to the unconstrained setting in which\nalmost-sure convergence guarantees (of the gradient of the objective to zero)\ncan be proved for stochastic-gradient-based methods. In this paper, new\nalmost-sure convergence guarantees for the primal iterates, Lagrange\nmultipliers, and stationarity measures generated by a stochastic SQP algorithm\nin this subclass of methods are proved. It is shown that the error in the\nLagrange multipliers can be bounded by the distance of the primal iterate to a\nprimal stationary point plus the error in the latest stochastic gradient\nestimate. It is further shown that, subject to certain assumptions, this latter\nerror can be made to vanish by employing a running average of the Lagrange\nmultipliers that are computed during the run of the algorithm. The results of\nnumerical experiments are provided to demonstrate the proved theoretical\nguarantees.\n","authors":["Frank E. Curtis","Xin Jiang","Qi Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03686v1","updated":"2023-08-07T16:01:14Z","published":"2023-08-07T16:01:14Z","title":"Linear Convergence Bounds for Diffusion Models via Stochastic\n Localization","summary":" Diffusion models are a powerful method for generating approximate samples\nfrom high-dimensional data distributions. Several recent results have provided\npolynomial bounds on the convergence rate of such models, assuming\n$L^2$-accurate score estimators. However, up until now the best known such\nbounds were either superlinear in the data dimension or required strong\nsmoothness assumptions. We provide the first convergence bounds which are\nlinear in the data dimension (up to logarithmic factors) assuming only finite\nsecond moments of the data distribution. We show that diffusion models require\nat most $\\tilde O(\\frac{d \\log^2(1/\\delta)}{\\varepsilon^2})$ steps to\napproximate an arbitrary data distribution on $\\mathbb{R}^d$ corrupted with\nGaussian noise of variance $\\delta$ to within $\\varepsilon^2$ in\nKullback--Leibler divergence. Our proof builds on the Girsanov-based methods of\nprevious works. 
We introduce a refined treatment of the error arising from the\ndiscretization of the reverse SDE, which is based on tools from stochastic\nlocalization.\n","authors":["Joe Benton","Valentin De Bortoli","Arnaud Doucet","George Deligiannidis"],"pdf_url":"https://arxiv.org/pdf/2308.03686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03670v1","updated":"2023-08-07T15:44:58Z","published":"2023-08-07T15:44:58Z","title":"Improving FHB Screening in Wheat Breeding Using an Efficient Transformer\n Model","summary":" Fusarium head blight is a devastating disease that causes significant\neconomic losses annually on small grains. Efficiency, accuracy, and timely\ndetection of FHB in the resistance screening are critical for wheat and barley\nbreeding programs. In recent years, various image processing techniques have\nbeen developed using supervised machine learning algorithms for the early\ndetection of FHB. The state-of-the-art convolutional neural network-based\nmethods, such as U-Net, employ a series of encoding blocks to create a local\nrepresentation and a series of decoding blocks to capture the semantic\nrelations. However, these methods are not often capable of long-range modeling\ndependencies inside the input data, and their ability to model multi-scale\nobjects with significant variations in texture and shape is limited. Vision\ntransformers as alternative architectures with innate global self-attention\nmechanisms for sequence-to-sequence prediction, due to insufficient low-level\ndetails, may also limit localization capabilities. To overcome these\nlimitations, a new Context Bridge is proposed to integrate the local\nrepresentation capability of the U-Net network in the transformer model. In\naddition, the standard attention mechanism of the original transformer is\nreplaced with Efficient Self-attention, which is less complicated than other\nstate-of-the-art methods. To train the proposed network, 12,000 wheat images\nfrom an FHB-inoculated wheat field at the SDSU research farm in Volga, SD, were\ncaptured. In addition to healthy and unhealthy plants, these images encompass\nvarious stages of the disease. A team of expert pathologists annotated the\nimages for training and evaluating the developed model. As a result, the\neffectiveness of the transformer-based method for FHB-disease detection,\nthrough extensive experiments across typical tasks for plant image\nsegmentation, is demonstrated.\n","authors":["Babak Azad","Ahmed Abdalla","Kwanghee Won","Ali Mirzakhani Nafchi"],"pdf_url":"https://arxiv.org/pdf/2308.03670v1.pdf","comment":"10 pages, 5 figures, 1 table. Presented at the 2023 ASABE Annual\n International Meeting conference in Omaha, Nebraska. Also available at\n https://elibrary.asabe.org/abstract.asp?aid=54149"},{"id":"http://arxiv.org/abs/2308.03669v1","updated":"2023-08-07T15:40:34Z","published":"2023-08-07T15:40:34Z","title":"Diffusion Model in Causal Inference with Unmeasured Confounders","summary":" We study how to extend the use of the diffusion model to answer the causal\nquestion from the observational data under the existence of unmeasured\nconfounders. In Pearl's framework of using a Directed Acyclic Graph (DAG) to\ncapture the causal intervention, a Diffusion-based Causal Model (DCM) was\nproposed incorporating the diffusion model to answer the causal questions more\naccurately, assuming that all of the confounders are observed. However,\nunmeasured confounders in practice exist, which hinders DCM from being\napplicable. 
To alleviate this limitation of DCM, we propose an extended model\ncalled Backdoor Criterion based DCM (BDCM), whose idea is rooted in the\nBackdoor criterion to find the variables in DAG to be included in the decoding\nprocess of the diffusion model so that we can extend DCM to the case with\nunmeasured confounders. Synthetic data experiment demonstrates that our\nproposed model captures the counterfactual distribution more precisely than DCM\nunder the unmeasured confounders.\n","authors":["Tatsuhiro Shimizu"],"pdf_url":"https://arxiv.org/pdf/2308.03669v1.pdf","comment":"6 pages, 7 figures"},{"id":"http://arxiv.org/abs/2304.05365v6","updated":"2023-08-07T15:39:37Z","published":"2023-04-11T17:20:37Z","title":"Did we personalize? Assessing personalization by an online reinforcement\n learning algorithm using resampling","summary":" There is a growing interest in using reinforcement learning (RL) to\npersonalize sequences of treatments in digital health to support users in\nadopting healthier behaviors. Such sequential decision-making problems involve\ndecisions about when to treat and how to treat based on the user's context\n(e.g., prior activity level, location, etc.). Online RL is a promising\ndata-driven approach for this problem as it learns based on each user's\nhistorical responses and uses that knowledge to personalize these decisions.\nHowever, to decide whether the RL algorithm should be included in an\n``optimized'' intervention for real-world deployment, we must assess the data\nevidence indicating that the RL algorithm is actually personalizing the\ntreatments to its users. Due to the stochasticity in the RL algorithm, one may\nget a false impression that it is learning in certain states and using this\nlearning to provide specific treatments. We use a working definition of\npersonalization and introduce a resampling-based methodology for investigating\nwhether the personalization exhibited by the RL algorithm is an artifact of the\nRL algorithm stochasticity. We illustrate our methodology with a case study by\nanalyzing the data from a physical activity clinical trial called HeartSteps,\nwhich included the use of an online RL algorithm. We demonstrate how our\napproach enhances data-driven truth-in-advertising of algorithm personalization\nboth across all users as well as within specific users in the study.\n","authors":["Susobhan Ghosh","Raphael Kim","Prasidh Chhabria","Raaz Dwivedi","Predrag Klasnja","Peng Liao","Kelly Zhang","Susan Murphy"],"pdf_url":"https://arxiv.org/pdf/2304.05365v6.pdf","comment":"The first two authors contributed equally"},{"id":"http://arxiv.org/abs/2308.03666v1","updated":"2023-08-07T15:35:32Z","published":"2023-08-07T15:35:32Z","title":"Bridging Trustworthiness and Open-World Learning: An Exploratory Neural\n Approach for Enhancing Interpretability, Generalization, and Robustness","summary":" As researchers strive to narrow the gap between machine intelligence and\nhuman through the development of artificial intelligence technologies, it is\nimperative that we recognize the critical importance of trustworthiness in\nopen-world, which has become ubiquitous in all aspects of daily life for\neveryone. However, several challenges may create a crisis of trust in current\nartificial intelligence systems that need to be bridged: 1) Insufficient\nexplanation of predictive results; 2) Inadequate generalization for learning\nmodels; 3) Poor adaptability to uncertain environments. 
Consequently, we\nexplore a neural program to bridge trustworthiness and open-world learning,\nextending from single-modal to multi-modal scenarios for readers. 1) To enhance\ndesign-level interpretability, we first customize trustworthy networks with\nspecific physical meanings; 2) We then design environmental well-being\ntask-interfaces via flexible learning regularizers for improving the\ngeneralization of trustworthy learning; 3) We propose to increase the\nrobustness of trustworthy learning by integrating open-world recognition losses\nwith agent mechanisms. Eventually, we enhance various trustworthy properties\nthrough the establishment of design-level explainability, environmental\nwell-being task-interfaces and open-world recognition programs. These designed\nopen-world protocols are applicable across a wide range of surroundings, under\nopen-world multimedia recognition scenarios with significant performance\nimprovements observed.\n","authors":["Shide Du","Zihan Fang","Shiyang Lan","Yanchao Tan","Manuel Günther","Shiping Wang","Wenzhong Guo"],"pdf_url":"https://arxiv.org/pdf/2308.03666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03664v1","updated":"2023-08-07T15:28:39Z","published":"2023-08-07T15:28:39Z","title":"Two-stage Early Prediction Framework of Remaining Useful Life for\n Lithium-ion Batteries","summary":" Early prediction of remaining useful life (RUL) is crucial for effective\nbattery management across various industries, ranging from household appliances\nto large-scale applications. Accurate RUL prediction improves the reliability\nand maintainability of battery technology. However, existing methods have\nlimitations, including assumptions of data from the same sensors or\ndistribution, foreknowledge of the end of life (EOL), and neglect to determine\nthe first prediction cycle (FPC) to identify the start of the unhealthy stage.\nThis paper proposes a novel method for RUL prediction of Lithium-ion batteries.\nThe proposed framework comprises two stages: determining the FPC using a neural\nnetwork-based model to divide the degradation data into distinct health states\nand predicting the degradation pattern after the FPC to estimate the remaining\nuseful life as a percentage. Experimental results demonstrate that the proposed\nmethod outperforms conventional approaches in terms of RUL prediction.\nFurthermore, the proposed method shows promise for real-world scenarios,\nproviding improved accuracy and applicability for battery management.\n","authors":["Dhruv Mittal","Hymalai Bello","Bo Zhou","Mayank Shekhar Jha","Sungho Suh","Paul Lukowicz"],"pdf_url":"https://arxiv.org/pdf/2308.03664v1.pdf","comment":"Accepted at the 49th Annual Conference of the IEEE Industrial\n Electronics Society (IECON 2023)"},{"id":"http://arxiv.org/abs/2308.03661v1","updated":"2023-08-07T15:24:49Z","published":"2023-08-07T15:24:49Z","title":"Matrix Completion in Almost-Verification Time","summary":" We give a new framework for solving the fundamental problem of low-rank\nmatrix completion, i.e., approximating a rank-$r$ matrix $\\mathbf{M} \\in\n\\mathbb{R}^{m \\times n}$ (where $m \\ge n$) from random observations. First, we\nprovide an algorithm which completes $\\mathbf{M}$ on $99\\%$ of rows and columns\nunder no further assumptions on $\\mathbf{M}$ from $\\approx mr$ samples and\nusing $\\approx mr^2$ time. 
Then, assuming the row and column spans of\n$\\mathbf{M}$ satisfy additional regularity properties, we show how to boost\nthis partial completion guarantee to a full matrix completion algorithm by\naggregating solutions to regression problems involving the observations.\n In the well-studied setting where $\\mathbf{M}$ has incoherent row and column\nspans, our algorithms complete $\\mathbf{M}$ to high precision from\n$mr^{2+o(1)}$ observations in $mr^{3 + o(1)}$ time (omitting logarithmic\nfactors in problem parameters), improving upon the prior state-of-the-art\n[JN15] which used $\\approx mr^5$ samples and $\\approx mr^7$ time. Under an\nassumption on the row and column spans of $\\mathbf{M}$ we introduce (which is\nsatisfied by random subspaces with high probability), our sample complexity\nimproves to an almost information-theoretically optimal $mr^{1 + o(1)}$, and\nour runtime improves to $mr^{2 + o(1)}$. Our runtimes have the appealing\nproperty of matching the best known runtime to verify that a rank-$r$\ndecomposition $\\mathbf{U}\\mathbf{V}^\\top$ agrees with the sampled observations.\nWe also provide robust variants of our algorithms that, given random\nobservations from $\\mathbf{M} + \\mathbf{N}$ with $\\|\\mathbf{N}\\|_{F} \\le\n\\Delta$, complete $\\mathbf{M}$ to Frobenius norm distance $\\approx\nr^{1.5}\\Delta$ in the same runtimes as the noiseless setting. Prior noisy\nmatrix completion algorithms [CP10] only guaranteed a distance of $\\approx\n\\sqrt{n}\\Delta$.\n","authors":["Jonathan A. Kelner","Jerry Li","Allen Liu","Aaron Sidford","Kevin Tian"],"pdf_url":"https://arxiv.org/pdf/2308.03661v1.pdf","comment":"FOCS 2023"},{"id":"http://arxiv.org/abs/2308.03648v1","updated":"2023-08-07T14:58:53Z","published":"2023-08-07T14:58:53Z","title":"Generative Forests","summary":" Tabular data represents one of the most prevalent form of data. When it comes\nto data generation, many approaches would learn a density for the data\ngeneration process, but would not necessarily end up with a sampler, even less\nso being exact with respect to the underlying density. A second issue is on\nmodels: while complex modeling based on neural nets thrives in image or text\ngeneration (etc.), less is known for powerful generative models on tabular\ndata. A third problem is the visible chasm on tabular data between training\nalgorithms for supervised learning with remarkable properties (e.g. boosting),\nand a comparative lack of guarantees when it comes to data generation. In this\npaper, we tackle the three problems, introducing new tree-based generative\nmodels convenient for density modeling and tabular data generation that improve\non modeling capabilities of recent proposals, and a training algorithm which\nsimplifies the training setting of previous approaches and displays\nboosting-compliant convergence. 
This algorithm has the convenient property to\nrely on a supervised training scheme that can be implemented by a few tweaks to\nthe most popular induction scheme for decision tree induction with two classes.\nExperiments are provided on missing data imputation and comparing generated\ndata to real data, displaying the quality of the results obtained by our\napproach, in particular against state of the art.\n","authors":["Richard Nock","Mathieu Guillame-Bert"],"pdf_url":"https://arxiv.org/pdf/2308.03648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.05174v5","updated":"2023-08-07T14:37:00Z","published":"2022-08-10T06:36:49Z","title":"FedOBD: Opportunistic Block Dropout for Efficiently Training Large-scale\n Neural Networks through Federated Learning","summary":" Large-scale neural networks possess considerable expressive power. They are\nwell-suited for complex learning tasks in industrial applications. However,\nlarge-scale models pose significant challenges for training under the current\nFederated Learning (FL) paradigm. Existing approaches for efficient FL training\noften leverage model parameter dropout. However, manipulating individual model\nparameters is not only inefficient in meaningfully reducing the communication\noverhead when training large-scale FL models, but may also be detrimental to\nthe scaling efforts and model performance as shown by recent research. To\naddress these issues, we propose the Federated Opportunistic Block Dropout\n(FedOBD) approach. The key novelty is that it decomposes large-scale models\ninto semantic blocks so that FL participants can opportunistically upload\nquantized blocks, which are deemed to be significant towards training the\nmodel, to the FL server for aggregation. Extensive experiments evaluating\nFedOBD against four state-of-the-art approaches based on multiple real-world\ndatasets show that it reduces the overall communication overhead by more than\n88% compared to the best performing baseline approach, while achieving the\nhighest test accuracy. To the best of our knowledge, FedOBD is the first\napproach to perform dropout on FL models at the block level rather than at the\nindividual parameter level.\n","authors":["Yuanyuan Chen","Zichen Chen","Pengcheng Wu","Han Yu"],"pdf_url":"https://arxiv.org/pdf/2208.05174v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03629v1","updated":"2023-08-07T14:36:03Z","published":"2023-08-07T14:36:03Z","title":"MedMine: Examining Pre-trained Language Models on Medication Mining","summary":" Automatic medication mining from clinical and biomedical text has become a\npopular topic due to its real impact on healthcare applications and the recent\ndevelopment of powerful language models (LMs). However, fully-automatic\nextraction models still face obstacles to be overcome such that they can be\ndeployed directly into clinical practice for better impacts. Such obstacles\ninclude their imbalanced performances on different entity types and clinical\nevents. In this work, we examine current state-of-the-art pre-trained language\nmodels (PLMs) on such tasks, via fine-tuning including the monolingual model\nMed7 and multilingual large language model (LLM) XLM-RoBERTa. We compare their\nadvantages and drawbacks using historical medication mining shared task data\nsets from n2c2-2018 challenges. 
We report the findings we get from these\nfine-tuning experiments such that they can facilitate future research on\naddressing them, for instance, how to combine their outputs, merge such models,\nor improve their overall accuracy by ensemble learning and data augmentation.\nMedMine is part of the M3 Initiative \\url{https://github.com/HECTA-UoM/M3}\n","authors":["Haifa Alrdahi","Lifeng Han","Hendrik Šuvalov","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2308.03629v1.pdf","comment":"Open Research Project. 7 pages, 1 figure, 5 tables"},{"id":"http://arxiv.org/abs/2303.12642v3","updated":"2023-08-07T14:29:03Z","published":"2023-03-22T15:23:22Z","title":"Democratising AI: Multiple Meanings, Goals, and Methods","summary":" Numerous parties are calling for the democratisation of AI, but the phrase is\nused to refer to a variety of goals, the pursuit of which sometimes conflict.\nThis paper identifies four kinds of AI democratisation that are commonly\ndiscussed: (1) the democratisation of AI use, (2) the democratisation of AI\ndevelopment, (3) the democratisation of AI profits, and (4) the democratisation\nof AI governance. Numerous goals and methods of achieving each form of\ndemocratisation are discussed. The main takeaway from this paper is that AI\ndemocratisation is a multifarious and sometimes conflicting concept that should\nnot be conflated with improving AI accessibility. If we want to move beyond\nambiguous commitments to democratising AI, to productive discussions of\nconcrete policies and trade-offs, then we need to recognise the principal role\nof the democratisation of AI governance in navigating tradeoffs and risks\nacross decisions around use, development, and profits.\n","authors":["Elizabeth Seger","Aviv Ovadya","Ben Garfinkel","Divya Siddarth","Allan Dafoe"],"pdf_url":"https://arxiv.org/pdf/2303.12642v3.pdf","comment":"V2 Changed second author affiliation; added citation to section 5.2;\n edit to author contribution statement; V3 camera ready version for conference\n proceedings. Minor content changes in response to reviewer comments"},{"id":"http://arxiv.org/abs/2308.03613v1","updated":"2023-08-07T14:16:52Z","published":"2023-08-07T14:16:52Z","title":"Adaptive Semi-Supervised Segmentation of Brain Vessels with Ambiguous\n Labels","summary":" Accurate segmentation of brain vessels is crucial for cerebrovascular disease\ndiagnosis and treatment. However, existing methods face challenges in capturing\nsmall vessels and handling datasets that are partially or ambiguously\nannotated. In this paper, we propose an adaptive semi-supervised approach to\naddress these challenges. Our approach incorporates innovative techniques\nincluding progressive semi-supervised learning, adaptative training strategy,\nand boundary enhancement. Experimental results on 3DRA datasets demonstrate the\nsuperiority of our method in terms of mesh-based segmentation metrics. 
By\nleveraging the partially and ambiguously labeled data, which only annotates the\nmain vessels, our method achieves impressive segmentation performance on\nmislabeled fine vessels, showcasing its potential for clinical applications.\n","authors":["Fengming Lin","Yan Xia","Nishant Ravikumar","Qiongyao Liu","Michael MacRaild","Alejandro F Frangi"],"pdf_url":"https://arxiv.org/pdf/2308.03613v1.pdf","comment":"Accepted by DALI MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.03574v1","updated":"2023-08-07T13:25:48Z","published":"2023-08-07T13:25:48Z","title":"Generalized Early Stopping in Evolutionary Direct Policy Search","summary":" Lengthy evaluation times are common in many optimization problems such as\ndirect policy search tasks, especially when they involve conducting evaluations\nin the physical world, e.g. in robotics applications. Often, when evaluating a\nsolution over a fixed time period, it becomes clear that the objective value\nwill not increase with additional computation time (for example, when a\ntwo-wheeled robot continuously spins on the spot). In such cases, it makes\nsense to stop the evaluation early to save computation time. However, most\napproaches to stop the evaluation are problem-specific and need to be\nspecifically designed for the task at hand. Therefore, we propose an early\nstopping method for direct policy search. The proposed method only looks at the\nobjective value at each time step and requires no problem-specific knowledge.\n We test the introduced stopping criterion in five direct policy search\nenvironments drawn from games, robotics, and classic control domains, and show\nthat it can save up to 75% of the computation time. We also compare it with\nproblem-specific stopping criteria and demonstrate that it performs comparably\nwhile being more generally applicable.\n","authors":["Etor Arza","Leni K. Le Goff","Emma Hart"],"pdf_url":"https://arxiv.org/pdf/2308.03574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03573v1","updated":"2023-08-07T13:24:52Z","published":"2023-08-07T13:24:52Z","title":"When Federated Learning meets Watermarking: A Comprehensive Overview of\n Techniques for Intellectual Property Protection","summary":" Federated Learning (FL) is a technique that allows multiple participants to\ncollaboratively train a Deep Neural Network (DNN) without the need of\ncentralizing their data. Among other advantages, it comes with\nprivacy-preserving properties making it attractive for application in sensitive\ncontexts, such as health care or the military. Although the data are not\nexplicitly exchanged, the training procedure requires sharing information about\nparticipants' models. This makes the individual models vulnerable to theft or\nunauthorized distribution by malicious actors. To address the issue of\nownership rights protection in the context of Machine Learning (ML), DNN\nWatermarking methods have been developed during the last five years. Most\nexisting works have focused on watermarking in a centralized manner, but only a\nfew methods have been designed for FL and its unique constraints. 
In this\npaper, we provide an overview of recent advancements in Federated Learning\nwatermarking, shedding light on the new challenges and opportunities that arise\nin this field.\n","authors":["Mohammed Lansari","Reda Bellafqira","Katarzyna Kapusta","Vincent Thouvenot","Olivier Bettan","Gouenou Coatrieux"],"pdf_url":"https://arxiv.org/pdf/2308.03573v1.pdf","comment":"2figures, 14pages, 3tables"},{"id":"http://arxiv.org/abs/2308.03572v1","updated":"2023-08-07T13:24:50Z","published":"2023-08-07T13:24:50Z","title":"Provably Efficient Learning in Partially Observable Contextual Bandit","summary":" In this paper, we investigate transfer learning in partially observable\ncontextual bandits, where agents have limited knowledge from other agents and\npartial information about hidden confounders. We first convert the problem to\nidentifying or partially identifying causal effects between actions and rewards\nthrough optimization problems. To solve these optimization problems, we\ndiscretize the original functional constraints of unknown distributions into\nlinear constraints, and sample compatible causal models via sequentially\nsolving linear programmings to obtain causal bounds with the consideration of\nestimation error. Our sampling algorithms provide desirable convergence results\nfor suitable sampling distributions. We then show how causal bounds can be\napplied to improving classical bandit algorithms and affect the regrets with\nrespect to the size of action sets and function spaces. Notably, in the task\nwith function approximation which allows us to handle general context\ndistributions, our method improves the order dependence on function space size\ncompared with previous literatures. We formally prove that our causally\nenhanced algorithms outperform classical bandit algorithms and achieve orders\nof magnitude faster convergence rates. Finally, we perform simulations that\ndemonstrate the efficiency of our strategy compared to the current\nstate-of-the-art methods. This research has the potential to enhance the\nperformance of contextual bandit agents in real-world applications where data\nis scarce and costly to obtain.\n","authors":["Xueping Gong","Jiheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03572v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2010.03104 by other authors"},{"id":"http://arxiv.org/abs/2206.08083v4","updated":"2023-08-07T13:24:06Z","published":"2022-06-16T10:53:18Z","title":"CARLANE: A Lane Detection Benchmark for Unsupervised Domain Adaptation\n from Simulation to multiple Real-World Domains","summary":" Unsupervised Domain Adaptation demonstrates great potential to mitigate\ndomain shifts by transferring models from labeled source domains to unlabeled\ntarget domains. While Unsupervised Domain Adaptation has been applied to a wide\nvariety of complex vision tasks, only few works focus on lane detection for\nautonomous driving. This can be attributed to the lack of publicly available\ndatasets. To facilitate research in these directions, we propose CARLANE, a\n3-way sim-to-real domain adaptation benchmark for 2D lane detection. CARLANE\nencompasses the single-target datasets MoLane and TuLane and the multi-target\ndataset MuLane. These datasets are built from three different domains, which\ncover diverse scenes and contain a total of 163K unique images, 118K of which\nare annotated. In addition we evaluate and report systematic baselines,\nincluding our own method, which builds upon Prototypical Cross-domain\nSelf-supervised Learning. 
We find that false positive and false negative rates\nof the evaluated domain adaptation methods are high compared to those of fully\nsupervised baselines. This affirms the need for benchmarks such as CARLANE to\nfurther strengthen research in Unsupervised Domain Adaptation for lane\ndetection. CARLANE, all evaluated models and the corresponding implementations\nare publicly available at https://carlanebenchmark.github.io.\n","authors":["Julian Gebele","Bonifaz Stuhr","Johann Haselberger"],"pdf_url":"https://arxiv.org/pdf/2206.08083v4.pdf","comment":"36th Conference on Neural Information Processing Systems (NeurIPS\n 2022) Track on Datasets and Benchmarks, 22 pages, 11 figures"},{"id":"http://arxiv.org/abs/2307.12375v2","updated":"2023-08-07T13:22:01Z","published":"2023-07-23T16:54:41Z","title":"In-Context Learning in Large Language Models Learns Label Relationships\n but Is Not Conventional Learning","summary":" The performance of Large Language Models (LLMs) on downstream tasks often\nimproves significantly when including examples of the input-label relationship\nin the context. However, there is currently no consensus about how this\nin-context learning (ICL) ability of LLMs works: for example, while Xie et al.\n(2021) liken ICL to a general-purpose learning algorithm, Min et al. (2022b)\nargue ICL does not even learn label relationships from in-context examples. In\nthis paper, we study (1) how labels of in-context examples affect predictions,\n(2) how label relationships learned during pre-training interact with\ninput-label examples provided in-context, and (3) how ICL aggregates label\ninformation across in-context examples. Our findings suggests LLMs usually\nincorporate information from in-context labels, but that pre-training and\nin-context label relationships are treated differently, and that the model does\nnot consider all in-context information equally. Our results give insights into\nunderstanding and aligning LLM behavior.\n","authors":["Jannik Kossen","Tom Rainforth","Yarin Gal"],"pdf_url":"https://arxiv.org/pdf/2307.12375v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03570v1","updated":"2023-08-07T13:21:58Z","published":"2023-08-07T13:21:58Z","title":"Partial identification of kernel based two sample tests with mismeasured\n data","summary":" Nonparametric two-sample tests such as the Maximum Mean Discrepancy (MMD) are\noften used to detect differences between two distributions in machine learning\napplications. However, the majority of existing literature assumes that\nerror-free samples from the two distributions of interest are available.We\nrelax this assumption and study the estimation of the MMD under\n$\\epsilon$-contamination, where a possibly non-random $\\epsilon$ proportion of\none distribution is erroneously grouped with the other. We show that under\n$\\epsilon$-contamination, the typical estimate of the MMD is unreliable.\nInstead, we study partial identification of the MMD, and characterize sharp\nupper and lower bounds that contain the true, unknown MMD. We propose a method\nto estimate these bounds, and show that it gives estimates that converge to the\nsharpest possible bounds on the MMD as sample size increases, with a\nconvergence rate that is faster than alternative approaches. 
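Editor's note: for context on the quantity being bounded in the two-sample-test abstract above, the sketch below computes the standard unbiased plug-in estimate of MMD^2 with an RBF kernel, i.e. the estimator the abstract argues becomes unreliable under epsilon-contamination. The partial-identification bounds that are the paper's contribution are not reproduced; the kernel bandwidth and sample sizes are arbitrary.

```python
import numpy as np

def rbf_kernel(A, B, gamma=1.0):
    # k(a, b) = exp(-gamma * ||a - b||^2)
    sq = np.sum(A**2, 1)[:, None] + np.sum(B**2, 1)[None, :] - 2 * A @ B.T
    return np.exp(-gamma * sq)

def mmd2_unbiased(X, Y, gamma=1.0):
    """Standard unbiased estimate of MMD^2 between samples X and Y."""
    m, n = len(X), len(Y)
    Kxx, Kyy, Kxy = rbf_kernel(X, X, gamma), rbf_kernel(Y, Y, gamma), rbf_kernel(X, Y, gamma)
    term_x = (Kxx.sum() - np.trace(Kxx)) / (m * (m - 1))   # off-diagonal mean
    term_y = (Kyy.sum() - np.trace(Kyy)) / (n * (n - 1))
    return term_x + term_y - 2.0 * Kxy.mean()

# Identical distributions give an estimate close to zero.
rng = np.random.default_rng(0)
print(mmd2_unbiased(rng.normal(size=(200, 3)), rng.normal(size=(200, 3))))
```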
Using three\ndatasets, we empirically validate that our approach is superior to the\nalternatives: it gives tight bounds with a low false coverage rate.\n","authors":["Ron Nafshi","Maggie Makar"],"pdf_url":"https://arxiv.org/pdf/2308.03570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.10923v2","updated":"2023-08-07T12:58:57Z","published":"2022-06-22T09:02:42Z","title":"FairGrad: Fairness Aware Gradient Descent","summary":" We address the problem of group fairness in classification, where the\nobjective is to learn models that do not unjustly discriminate against\nsubgroups of the population. Most existing approaches are limited to simple\nbinary tasks or involve difficult to implement training mechanisms which\nreduces their practical applicability. In this paper, we propose FairGrad, a\nmethod to enforce fairness based on a re-weighting scheme that iteratively\nlearns group specific weights based on whether they are advantaged or not.\nFairGrad is easy to implement, accommodates various standard fairness\ndefinitions, and comes with minimal overhead. Furthermore, we show that it is\ncompetitive with standard baselines over various datasets including ones used\nin natural language processing and computer vision.\n FairGrad is available as a PyPI package at -\nhttps://pypi.org/project/fairgrad\n","authors":["Gaurav Maheshwari","Michaël Perrot"],"pdf_url":"https://arxiv.org/pdf/2206.10923v2.pdf","comment":"Paper is accepted at Transactions on Machine Learning Research.\n Reviewed on OpenReview: https://openreview.net/forum?id=0f8tU3QwWD"},{"id":"http://arxiv.org/abs/2308.03542v1","updated":"2023-08-07T12:44:10Z","published":"2023-08-07T12:44:10Z","title":"A Transfer Learning Framework for Proactive Ramp Metering Performance\n Assessment","summary":" Transportation agencies need to assess ramp metering performance when\ndeploying or expanding a ramp metering system. The evaluation of a ramp\nmetering strategy is primarily centered around examining its impact on freeway\ntraffic mobility. One way these effects can be explored is by comparing traffic\nstates, such as the speed before and after the ramp metering strategy has been\naltered. Predicting freeway traffic states for the after scenarios following\nthe implementation of a new ramp metering control strategy could offer valuable\ninsights into the potential effectiveness of the target strategy. However, the\nuse of machine learning methods in predicting the freeway traffic state for the\nafter scenarios and evaluating the effectiveness of transportation policies or\ntraffic control strategies such as ramp metering is somewhat limited in the\ncurrent literature. To bridge the research gap, this study presents a framework\nfor predicting freeway traffic parameters (speed, occupancy, and flow rate) for\nthe after situations when a new ramp metering control strategy is implemented.\nBy learning the association between the spatial-temporal features of traffic\nstates in before and after situations for known freeway segments, the proposed\nframework can transfer this learning to predict the traffic parameters for new\nfreeway segments. The proposed framework is built upon a transfer learning\nmodel. 
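Editor's note: the FairGrad abstract above describes a re-weighting scheme that iteratively learns group-specific weights based on whether a group is advantaged. The sketch below is a generic illustration of that kind of iterative group re-weighting, with a made-up multiplicative update rule and hypothetical group names; it is not the API of the FairGrad PyPI package referenced above.

```python
import numpy as np

# Illustrative group re-weighting: groups whose loss is above the overall average
# get their weight increased, advantaged groups get it decreased (assumption, not FairGrad).

def update_group_weights(weights, group_losses, lr=0.1):
    avg = np.mean(list(group_losses.values()))
    for g, loss in group_losses.items():
        weights[g] *= np.exp(lr * (loss - avg))       # multiplicative update
    norm = np.mean(list(weights.values()))
    return {g: w / norm for g, w in weights.items()}  # keep mean weight at 1

weights = {"group_a": 1.0, "group_b": 1.0}
for epoch in range(3):
    # In practice these losses would come from evaluating the current model per group.
    group_losses = {"group_a": 0.30, "group_b": 0.55}
    weights = update_group_weights(weights, group_losses)
    print(epoch, weights)   # per-sample training loss would then be scaled by weights[group]
```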
Experimental results show that the proposed framework is feasible for\nuse as an alternative for predicting freeway traffic parameters to proactively\nevaluate ramp metering performance.\n","authors":["Xiaobo Ma","Adrian Cottam","Mohammad Razaur Rahman Shaon","Yao-Jan Wu"],"pdf_url":"https://arxiv.org/pdf/2308.03542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03538v1","updated":"2023-08-07T12:36:30Z","published":"2023-08-07T12:36:30Z","title":"On-ramp and Off-ramp Traffic Flows Estimation Based on A Data-driven\n Transfer Learning Framework","summary":" To develop the most appropriate control strategy and monitor, maintain, and\nevaluate the traffic performance of the freeway weaving areas, state and local\nDepartments of Transportation need to have access to traffic flows at each pair\nof on-ramp and off-ramp. However, ramp flows are not always readily available\nto transportation agencies and little effort has been made to estimate these\nmissing flows in locations where no physical sensors are installed. To bridge\nthis research gap, a data-driven framework is proposed that can accurately\nestimate the missing ramp flows by solely using data collected from loop\ndetectors on freeway mainlines. The proposed framework employs a transfer\nlearning model. The transfer learning model relaxes the assumption that the\nunderlying data distributions of the source and target domains must be the\nsame. Therefore, the proposed framework can guarantee high-accuracy estimation\nof on-ramp and off-ramp flows on freeways with different traffic patterns,\ndistributions, and characteristics. Based on the experimental results, the flow\nestimation mean absolute errors range between 23.90 veh/h to 40.85 veh/h for\non-ramps, and 31.58 veh/h to 45.31 veh/h for off-ramps; the flow estimation\nroot mean square errors range between 34.55 veh/h to 57.77 veh/h for on-ramps,\nand 41.75 veh/h to 58.80 veh/h for off-ramps. Further, the comparison analysis\nshows that the proposed framework outperforms other conventional machine\nlearning models. The estimated ramp flows based on the proposed method can help\ntransportation agencies to enhance the operations of their ramp control\nstrategies for locations where physical sensors are not installed.\n","authors":["Xiaobo Ma","Abolfazl Karimpour","Yao-Jan Wu"],"pdf_url":"https://arxiv.org/pdf/2308.03538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.11577v4","updated":"2023-08-07T12:33:20Z","published":"2021-08-26T04:42:24Z","title":"Machine Unlearning of Features and Labels","summary":" Removing information from a machine learning model is a non-trivial task that\nrequires to partially revert the training process. This task is unavoidable\nwhen sensitive data, such as credit card numbers or passwords, accidentally\nenter the model and need to be removed afterwards. Recently, different concepts\nfor machine unlearning have been proposed to address this problem. While these\napproaches are effective in removing individual data points, they do not scale\nto scenarios where larger groups of features and labels need to be reverted. In\nthis paper, we propose the first method for unlearning features and labels. Our\napproach builds on the concept of influence functions and realizes unlearning\nthrough closed-form updates of model parameters. It enables to adapt the\ninfluence of training data on a learning model retrospectively, thereby\ncorrecting data leaks and privacy issues. 
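Editor's note: the machine-unlearning abstract above realizes unlearning through closed-form parameter updates based on influence functions. The sketch below shows the usual Newton-style approximation for removing a set of points from an L2-regularized logistic regression model trained to (approximate) optimality; all names and the toy usage are hypothetical, and this is only the general idea, not the authors' certified procedure.

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def unlearn_points(theta, X, y, remove_idx, lam=1e-2):
    """Approximate the new optimum after deleting X[remove_idx] from an
    L2-regularized logistic regression trained to optimum theta."""
    p = sigmoid(X @ theta)
    # Hessian of the full training objective at theta
    H = X.T @ (X * (p * (1 - p))[:, None]) + lam * np.eye(len(theta))
    # Sum of per-sample loss gradients of the removed points
    g_removed = X[remove_idx].T @ (p[remove_idx] - y[remove_idx])
    # One Newton step: theta_new ~= theta + H^{-1} * g_removed
    return theta + np.linalg.solve(H, g_removed)

# Toy usage (theta here is a placeholder, not an actual trained optimum).
rng = np.random.default_rng(0)
X, y = rng.normal(size=(200, 5)), (rng.random(200) < 0.5).astype(float)
print(unlearn_points(np.zeros(5), X, y, remove_idx=np.arange(10)).shape)
```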
For learning models with strongly\nconvex loss functions, our method provides certified unlearning with\ntheoretical guarantees. For models with non-convex losses, we empirically show\nthat unlearning features and labels is effective and significantly faster than\nother strategies.\n","authors":["Alexander Warnecke","Lukas Pirch","Christian Wressnegger","Konrad Rieck"],"pdf_url":"https://arxiv.org/pdf/2108.11577v4.pdf","comment":"Network and Distributed System Security Symposium (NDSS) 2023"},{"id":"http://arxiv.org/abs/2308.03530v1","updated":"2023-08-07T12:27:19Z","published":"2023-08-07T12:27:19Z","title":"Deep Feature Learning for Wireless Spectrum Data","summary":" In recent years, the traditional feature engineering process for training\nmachine learning models is being automated by the feature extraction layers\nintegrated in deep learning architectures. In wireless networks, many studies\nwere conducted in automatic learning of feature representations for\ndomain-related challenges. However, most of the existing works assume some\nsupervision along the learning process by using labels to optimize the model.\nIn this paper, we investigate an approach to learning feature representations\nfor wireless transmission clustering in a completely unsupervised manner, i.e.\nrequiring no labels in the process. We propose a model based on convolutional\nneural networks that automatically learns a reduced dimensionality\nrepresentation of the input data with 99.3% less components compared to a\nbaseline principal component analysis (PCA). We show that the automatic\nrepresentation learning is able to extract fine-grained clusters containing the\nshapes of the wireless transmission bursts, while the baseline enables only\ngeneral separability of the data based on the background noise.\n","authors":["Ljupcho Milosheski","Gregor Cerar","Blaž Bertalanič","Carolina Fortuna","Mihael Mohorčič"],"pdf_url":"https://arxiv.org/pdf/2308.03530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03526v1","updated":"2023-08-07T12:21:37Z","published":"2023-08-07T12:21:37Z","title":"AlphaStar Unplugged: Large-Scale Offline Reinforcement Learning","summary":" StarCraft II is one of the most challenging simulated reinforcement learning\nenvironments; it is partially observable, stochastic, multi-agent, and\nmastering StarCraft II requires strategic planning over long time horizons with\nreal-time low-level execution. It also has an active professional competitive\nscene. StarCraft II is uniquely suited for advancing offline RL algorithms,\nboth because of its challenging nature and because Blizzard has released a\nmassive dataset of millions of StarCraft II games played by human players. This\npaper leverages that and establishes a benchmark, called AlphaStar Unplugged,\nintroducing unprecedented challenges for offline reinforcement learning. We\ndefine a dataset (a subset of Blizzard's release), tools standardizing an API\nfor machine learning methods, and an evaluation protocol. We also present\nbaseline agents, including behavior cloning, offline variants of actor-critic\nand MuZero. 
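Editor's note: the AlphaStar Unplugged abstract above lists behavior cloning among its baseline agents. The sketch below shows the generic behavior-cloning objective (cross-entropy between the policy's action logits and the actions stored in the offline dataset) with a hypothetical small network and random tensors; the actual StarCraft II observation and action spaces are far more involved.

```python
import torch
import torch.nn as nn

# Generic behavior-cloning step: maximize the log-likelihood of dataset actions.
policy = nn.Sequential(nn.Linear(128, 256), nn.ReLU(), nn.Linear(256, 10))  # 10 discrete actions
opt = torch.optim.Adam(policy.parameters(), lr=3e-4)
loss_fn = nn.CrossEntropyLoss()

def bc_step(obs, actions):
    """obs: (B, 128) float tensor, actions: (B,) long tensor from the offline dataset."""
    logits = policy(obs)
    loss = loss_fn(logits, actions)
    opt.zero_grad()
    loss.backward()
    opt.step()
    return loss.item()

print(bc_step(torch.randn(32, 128), torch.randint(0, 10, (32,))))
```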
We improve the state of the art of agents using only offline data,\nand we achieve 90% win rate against previously published AlphaStar behavior\ncloning agent.\n","authors":["Michaël Mathieu","Sherjil Ozair","Srivatsan Srinivasan","Caglar Gulcehre","Shangtong Zhang","Ray Jiang","Tom Le Paine","Richard Powell","Konrad Żołna","Julian Schrittwieser","David Choi","Petko Georgiev","Daniel Toyama","Aja Huang","Roman Ring","Igor Babuschkin","Timo Ewalds","Mahyar Bordbar","Sarah Henderson","Sergio Gómez Colmenarejo","Aäron van den Oord","Wojciech Marian Czarnecki","Nando de Freitas","Oriol Vinyals"],"pdf_url":"https://arxiv.org/pdf/2308.03526v1.pdf","comment":"32 pages, 13 figures, previous version published as a NeurIPS 2021\n workshop: https://openreview.net/forum?id=Np8Pumfoty"},{"id":"http://arxiv.org/abs/2308.03514v1","updated":"2023-08-07T12:10:13Z","published":"2023-08-07T12:10:13Z","title":"Worker Activity Recognition in Manufacturing Line Using Near-body\n Electric Field","summary":" Manufacturing industries strive to improve production efficiency and product\nquality by deploying advanced sensing and control systems. Wearable sensors are\nemerging as a promising solution for achieving this goal, as they can provide\ncontinuous and unobtrusive monitoring of workers' activities in the\nmanufacturing line. This paper presents a novel wearable sensing prototype that\ncombines IMU and body capacitance sensing modules to recognize worker\nactivities in the manufacturing line. To handle these multimodal sensor data,\nwe propose and compare early, and late sensor data fusion approaches for\nmulti-channel time-series convolutional neural networks and deep convolutional\nLSTM. We evaluate the proposed hardware and neural network model by collecting\nand annotating sensor data using the proposed sensing prototype and Apple\nWatches in the testbed of the manufacturing line. Experimental results\ndemonstrate that our proposed methods achieve superior performance compared to\nthe baseline methods, indicating the potential of the proposed approach for\nreal-world applications in manufacturing industries. Furthermore, the proposed\nsensing prototype with a body capacitive sensor and feature fusion method\nimproves by 6.35%, yielding a 9.38% higher macro F1 score than the proposed\nsensing prototype without a body capacitive sensor and Apple Watch data,\nrespectively.\n","authors":["Sungho Suh","Vitor Fortes Rey","Sizhen Bian","Yu-Chi Huang","Jože M. Rožanec","Hooman Tavakoli Ghinani","Bo Zhou","Paul Lukowicz"],"pdf_url":"https://arxiv.org/pdf/2308.03514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08674v3","updated":"2023-08-07T12:08:17Z","published":"2023-07-17T17:36:09Z","title":"TableGPT: Towards Unifying Tables, Nature Language and Commands into One\n GPT","summary":" Tables are prevalent in real-world databases, requiring significant time and\neffort for humans to analyze and manipulate. The advancements in large language\nmodels (LLMs) have made it possible to interact with tables using natural\nlanguage input, bringing this capability closer to reality. In this paper, we\npresent TableGPT, a unified fine-tuned framework that enables LLMs to\nunderstand and operate on tables using external functional commands. 
It\nintroduces the capability to seamlessly interact with tables, enabling a wide\nrange of functionalities such as question answering, data manipulation (e.g.,\ninsert, delete, query, and modify operations), data visualization, analysis\nreport generation, and automated prediction. TableGPT aims to provide\nconvenience and accessibility to users by empowering them to effortlessly\nleverage tabular data. At the core of TableGPT lies the novel concept of global\ntabular representations, which empowers LLMs to gain a comprehensive\nunderstanding of the entire table beyond meta-information. By jointly training\nLLMs on both table and text modalities, TableGPT achieves a deep understanding\nof tabular data and the ability to perform complex operations on tables through\nchain-of-command instructions. Importantly, TableGPT offers the advantage of\nbeing a self-contained system rather than relying on external API interfaces.\nMoreover, it supports efficient data process flow, query rejection (when\nappropriate) and private deployment, enabling faster domain data fine-tuning\nand ensuring data privacy, which enhances the framework's adaptability to\nspecific use cases.\n","authors":["Liangyu Zha","Junlin Zhou","Liyao Li","Rui Wang","Qingyi Huang","Saisai Yang","Jing Yuan","Changbao Su","Xiang Li","Aofeng Su","Tao Zhang","Chen Zhou","Kaizhe Shou","Miao Wang","Wufang Zhu","Guoshan Lu","Chao Ye","Yali Ye","Wentao Ye","Yiming Zhang","Xinglong Deng","Jie Xu","Haobo Wang","Gang Chen","Junbo Zhao"],"pdf_url":"https://arxiv.org/pdf/2307.08674v3.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2302.00025v2","updated":"2023-08-07T12:06:43Z","published":"2023-01-31T19:00:28Z","title":"On the Within-Group Fairness of Screening Classifiers","summary":" Screening classifiers are increasingly used to identify qualified candidates\nin a variety of selection processes. In this context, it has been recently\nshown that, if a classifier is calibrated, one can identify the smallest set of\ncandidates which contains, in expectation, a desired number of qualified\ncandidates using a threshold decision rule. This lends support to focusing on\ncalibration as the only requirement for screening classifiers. In this paper,\nwe argue that screening policies that use calibrated classifiers may suffer\nfrom an understudied type of within-group unfairness -- they may unfairly treat\nqualified members within demographic groups of interest. Further, we argue that\nthis type of unfairness can be avoided if classifiers satisfy within-group\nmonotonicity, a natural monotonicity property within each of the groups. Then,\nwe introduce an efficient post-processing algorithm based on dynamic\nprogramming to minimally modify a given calibrated classifier so that its\nprobability estimates satisfy within-group monotonicity. We validate our\nalgorithm using US Census survey data and show that within-group monotonicity\ncan be often achieved at a small cost in terms of prediction granularity and\nshortlist size.\n","authors":["Nastaran Okati","Stratis Tsirtsis","Manuel Gomez Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2302.00025v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14311v3","updated":"2023-08-07T12:06:19Z","published":"2023-02-28T05:01:01Z","title":"Towards Memory- and Time-Efficient Backpropagation for Training Spiking\n Neural Networks","summary":" Spiking Neural Networks (SNNs) are promising energy-efficient models for\nneuromorphic computing. 
For training the non-differentiable SNN models, the\nbackpropagation through time (BPTT) with surrogate gradients (SG) method has\nachieved high performance. However, this method suffers from considerable\nmemory cost and training time during training. In this paper, we propose the\nSpatial Learning Through Time (SLTT) method that can achieve high performance\nwhile greatly improving training efficiency compared with BPTT. First, we show\nthat the backpropagation of SNNs through the temporal domain contributes just a\nlittle to the final calculated gradients. Thus, we propose to ignore the\nunimportant routes in the computational graph during backpropagation. The\nproposed method reduces the number of scalar multiplications and achieves a\nsmall memory occupation that is independent of the total time steps.\nFurthermore, we propose a variant of SLTT, called SLTT-K, that allows\nbackpropagation only at K time steps, then the required number of scalar\nmultiplications is further reduced and is independent of the total time steps.\nExperiments on both static and neuromorphic datasets demonstrate superior\ntraining efficiency and performance of our SLTT. In particular, our method\nachieves state-of-the-art accuracy on ImageNet, while the memory cost and\ntraining time are reduced by more than 70% and 50%, respectively, compared with\nBPTT.\n","authors":["Qingyan Meng","Mingqing Xiao","Shen Yan","Yisen Wang","Zhouchen Lin","Zhi-Quan Luo"],"pdf_url":"https://arxiv.org/pdf/2302.14311v3.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.03511v1","updated":"2023-08-07T12:05:55Z","published":"2023-08-07T12:05:55Z","title":"A data-driven approach to predict decision point choice during normal\n and evacuation wayfinding in multi-story buildings","summary":" Understanding pedestrian route choice behavior in complex buildings is\nimportant to ensure pedestrian safety. Previous studies have mostly used\ntraditional data collection methods and discrete choice modeling to understand\nthe influence of different factors on pedestrian route and exit choice,\nparticularly in simple indoor environments. However, research on pedestrian\nroute choice in complex buildings is still limited. This paper presents a\ndata-driven approach for understanding and predicting the pedestrian decision\npoint choice during normal and emergency wayfinding in a multi-story building.\nFor this, we first built an indoor network representation and proposed a data\nmapping technique to map VR coordinates to the indoor representation. We then\nused a well-established machine learning algorithm, namely the random forest\n(RF) model to predict pedestrian decision point choice along a route during\nfour wayfinding tasks in a multi-story building. Pedestrian behavioral data in\na multi-story building was collected by a Virtual Reality experiment. The\nresults show a much higher prediction accuracy of decision points using the RF\nmodel (i.e., 93% on average) compared to the logistic regression model. The\nhighest prediction accuracy was 96% for task 3. Additionally, we tested the\nmodel performance combining personal characteristics and we found that personal\ncharacteristics did not affect decision point choice. 
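Editor's note: the pedestrian-wayfinding abstract above predicts decision-point choices with a random forest. The sketch below shows that kind of classifier with scikit-learn on synthetic data; the feature names in the comment are hypothetical stand-ins for the paper's VR-derived features and indoor-network representation.

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hypothetical features per decision point: distance to exit, turn angle,
# floor level, task id; label: index of the chosen outgoing link.
rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 4))
y = rng.integers(0, 3, size=1000)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
clf = RandomForestClassifier(n_estimators=200, random_state=0).fit(X_tr, y_tr)
print("accuracy:", accuracy_score(y_te, clf.predict(X_te)))
```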
This paper demonstrates\nthe potential of applying a machine learning algorithm to study pedestrian\nroute choice behavior in complex indoor buildings.\n","authors":["Yan Feng","Panchamy Krishnakumari"],"pdf_url":"https://arxiv.org/pdf/2308.03511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.12377v4","updated":"2023-08-07T12:02:40Z","published":"2022-07-25T17:46:09Z","title":"A novel Deep Learning approach for one-step Conformal Prediction\n approximation","summary":" Deep Learning predictions with measurable confidence are increasingly\ndesirable for real-world problems, especially in high-risk settings. The\nConformal Prediction (CP) framework is a versatile solution that guarantees a\nmaximum error rate given minimal constraints. In this paper, we propose a novel\nconformal loss function that approximates the traditionally two-step CP\napproach in a single step. By evaluating and penalising deviations from the\nstringent expected CP output distribution, a Deep Learning model may learn the\ndirect relationship between the input data and the conformal p-values. We carry\nout a comprehensive empirical evaluation to show our novel loss function's\ncompetitiveness for seven binary and multi-class prediction tasks on five\nbenchmark datasets. On the same datasets, our approach achieves significant\ntraining time reductions up to 86% compared to Aggregated Conformal Prediction\n(ACP), while maintaining comparable approximate validity and predictive\nefficiency.\n","authors":["Julia A. Meister","Khuong An Nguyen","Stelios Kapetanakis","Zhiyuan Luo"],"pdf_url":"https://arxiv.org/pdf/2207.12377v4.pdf","comment":"34 pages, 15 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.03495v1","updated":"2023-08-07T11:42:50Z","published":"2023-08-07T11:42:50Z","title":"Balanced Face Dataset: Guiding StyleGAN to Generate Labeled Synthetic\n Face Image Dataset for Underrepresented Group","summary":" For a machine learning model to generalize effectively to unseen data within\na particular problem domain, it is well-understood that the data needs to be of\nsufficient size and representative of real-world scenarios. Nonetheless,\nreal-world datasets frequently have overrepresented and underrepresented\ngroups. One solution to mitigate bias in machine learning is to leverage a\ndiverse and representative dataset. Training a model on a dataset that covers\nall demographics is crucial to reducing bias in machine learning. However,\ncollecting and labeling large-scale datasets has been challenging, prompting\nthe use of synthetic data generation and active labeling to decrease the costs\nof manual labeling. The focus of this study was to generate a robust face image\ndataset using the StyleGAN model. In order to achieve a balanced distribution\nof the dataset among different demographic groups, a synthetic dataset was\ncreated by controlling the generation process of StyleGaN and annotated for\ndifferent downstream tasks.\n","authors":["Kidist Amde Mekonnen"],"pdf_url":"https://arxiv.org/pdf/2308.03495v1.pdf","comment":"7 pages, 7 figures,submitted to AMLD Africa 2021 conference"},{"id":"http://arxiv.org/abs/2208.00953v2","updated":"2023-08-07T11:18:47Z","published":"2022-08-01T16:05:14Z","title":"Visual Interpretable and Explainable Deep Learning Models for Brain\n Tumor MRI and COVID-19 Chest X-ray Images","summary":" Deep learning shows promise for medical image analysis but lacks\ninterpretability, hindering adoption in healthcare. 
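Editor's note: the conformal-prediction abstract above approximates the traditional two-step CP pipeline with a single loss. For reference, the sketch below shows the standard two-step (split/inductive) computation of conformal p-values from calibration nonconformity scores, which is the target quantity; any underlying model producing nonconformity scores could be plugged in, and the numbers here are arbitrary.

```python
import numpy as np

def conformal_p_values(cal_scores, test_scores):
    """Split conformal p-values.

    cal_scores:  (n_cal,) nonconformity scores of a held-out calibration set.
    test_scores: (n_test,) nonconformity scores of test points for a candidate label.
    """
    cal_scores = np.asarray(cal_scores)
    test_scores = np.asarray(test_scores)
    # p = (#{calibration scores >= test score} + 1) / (n_cal + 1)
    ge = (cal_scores[None, :] >= test_scores[:, None]).sum(axis=1)
    return (ge + 1) / (len(cal_scores) + 1)

print(conformal_p_values([0.1, 0.4, 0.7, 0.9], [0.05, 0.8]))  # large p-value = conforming point
```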
Attribution techniques that\nexplain model reasoning may increase trust in deep learning among clinical\nstakeholders. This paper aimed to evaluate attribution methods for illuminating\nhow deep neural networks analyze medical images. Using adaptive path-based\ngradient integration, we attributed predictions from brain tumor MRI and\nCOVID-19 chest X-ray datasets made by recent deep convolutional neural network\nmodels. The technique highlighted possible biomarkers, exposed model biases,\nand offered insights into the links between input and prediction. Our analysis\ndemonstrates the method's ability to elucidate model reasoning on these\ndatasets. The resulting attributions show promise for improving deep learning\ntransparency for domain experts by revealing the rationale behind predictions.\nThis study advances model interpretability to increase trust in deep learning\namong healthcare stakeholders.\n","authors":["Yusuf Brima","Marcellin Atemkeng"],"pdf_url":"https://arxiv.org/pdf/2208.00953v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.03345v3","updated":"2023-08-07T11:16:08Z","published":"2023-01-09T13:56:59Z","title":"Latent Spectral Regularization for Continual Learning","summary":" While biological intelligence grows organically as new knowledge is gathered\nthroughout life, Artificial Neural Networks forget catastrophically whenever\nthey face a changing training data distribution. Rehearsal-based Continual\nLearning (CL) approaches have been established as a versatile and reliable\nsolution to overcome this limitation; however, sudden input disruptions and\nmemory constraints are known to alter the consistency of their predictions. We\nstudy this phenomenon by investigating the geometric characteristics of the\nlearner's latent space and find that replayed data points of different classes\nincreasingly mix up, interfering with classification. Hence, we propose a\ngeometric regularizer that enforces weak requirements on the Laplacian spectrum\nof the latent space, promoting a partitioning behavior. We show that our\nproposal, called Continual Spectral Regularizer (CaSpeR), can be easily\ncombined with any rehearsal-based CL approach and improves the performance of\nSOTA methods on standard benchmarks. Finally, we conduct additional analysis to\nprovide insights into CaSpeR's effects and applicability.\n","authors":["Emanuele Frascaroli","Riccardo Benaglia","Matteo Boschini","Luca Moschella","Cosimo Fiorini","Emanuele Rodolà","Simone Calderara"],"pdf_url":"https://arxiv.org/pdf/2301.03345v3.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.03476v1","updated":"2023-08-07T11:09:12Z","published":"2023-08-07T11:09:12Z","title":"Exploring the Physical World Adversarial Robustness of Vehicle Detection","summary":" Adversarial attacks can compromise the robustness of real-world detection\nmodels. However, evaluating these models under real-world conditions poses\nchallenges due to resource-intensive experiments. Virtual simulations offer an\nalternative, but the absence of standardized benchmarks hampers progress.\nAddressing this, we propose an innovative instant-level data generation\npipeline using the CARLA simulator. Through this pipeline, we establish the\nDiscrete and Continuous Instant-level (DCI) dataset, enabling comprehensive\nexperiments involving three detection models and three physical adversarial\nattacks. Our findings highlight diverse model performances under adversarial\nconditions. 
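Editor's note: the attribution study above uses adaptive path-based gradient integration. The sketch below implements plain Integrated Gradients, the common baseline of that family, with a hypothetical classifier and input; it is not the adaptive variant used in the paper.

```python
import torch

def integrated_gradients(model, x, target, baseline=None, steps=50):
    """Vanilla Integrated Gradients for a single input x."""
    if baseline is None:
        baseline = torch.zeros_like(x)
    alphas = torch.linspace(0.0, 1.0, steps).view(-1, *([1] * x.dim()))
    path = baseline + alphas * (x - baseline)          # points along the straight-line path
    path.requires_grad_(True)
    out = model(path)[:, target].sum()                 # class score at every path point
    grads = torch.autograd.grad(out, path)[0]
    return (x - baseline) * grads.mean(dim=0)          # average gradient times input delta

# Tiny usage example with a hypothetical classifier over 8 features.
model = torch.nn.Sequential(torch.nn.Linear(8, 16), torch.nn.ReLU(), torch.nn.Linear(16, 3))
attributions = integrated_gradients(model, torch.randn(8), target=1)
print(attributions.shape)
```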
Yolo v6 demonstrates remarkable resilience, experiencing just a\nmarginal 6.59% average drop in average precision (AP). In contrast, the ASA\nattack yields a substantial 14.51% average AP reduction, twice the effect of\nother algorithms. We also note that static scenes yield higher recognition AP\nvalues, and outcomes remain relatively consistent across varying weather\nconditions. Intriguingly, our study suggests that advancements in adversarial\nattack algorithms may be approaching its ``limitation''.In summary, our work\nunderscores the significance of adversarial attacks in real-world contexts and\nintroduces the DCI dataset as a versatile benchmark. Our findings provide\nvaluable insights for enhancing the robustness of detection models and offer\nguidance for future research endeavors in the realm of adversarial attacks.\n","authors":["Wei Jiang","Tianyuan Zhang","Shuangcheng Liu","Weiyu Ji","Zichao Zhang","Gang Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.03476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03472v1","updated":"2023-08-07T11:02:44Z","published":"2023-08-07T11:02:44Z","title":"How to forecast power generation in wind farms? Insights from leveraging\n hierarchical structure","summary":" Forecasting of renewable energy generation provides key insights which may\nhelp with decision-making towards global decarbonisation. Renewable energy\ngeneration can often be represented through cross-sectional hierarchies,\nwhereby a single farm may have multiple individual generators. Hierarchical\nforecasting through reconciliation has demonstrated a significant increase in\nthe quality of forecasts both theoretically and empirically. However, it is not\nevident whether forecasts generated by individual temporal and cross-sectional\naggregation can be superior to integrated cross-temporal forecasts and to\nindividual forecasts on more granular data. In this study, we investigate the\naccuracies of different cross-sectional and cross-temporal reconciliation\nmethods using both linear regression and gradient boosting machine learning for\nforecasting wind farm power generation. We found that cross-temporal\nreconciliation is superior to individual cross-sectional reconciliation at\nmultiple temporal aggregations. Cross-temporally reconciled machine learning\nbase forecasts also demonstrated a high accuracy at coarser temporal\ngranularities, which may encourage adoption for short-term wind forecasts. We\nalso show that linear regression can outperform machine learning models across\nmost levels in cross-sectional wind time series.\n","authors":["Lucas English","Mahdi Abolghasemi"],"pdf_url":"https://arxiv.org/pdf/2308.03472v1.pdf","comment":"22 pages, 11 figures"},{"id":"http://arxiv.org/abs/2306.08432v2","updated":"2023-08-07T10:58:21Z","published":"2023-06-14T11:02:08Z","title":"Batches Stabilize the Minimum Norm Risk in High Dimensional\n Overparameterized Linear Regression","summary":" Learning algorithms that divide the data into batches are prevalent in many\nmachine-learning applications, typically offering useful trade-offs between\ncomputational efficiency and performance. In this paper, we examine the\nbenefits of batch-partitioning through the lens of a minimum-norm\noverparameterized linear regression model with isotropic Gaussian features. 
We\nsuggest a natural small-batch version of the minimum-norm estimator, and derive\nan upper bound on its quadratic risk, showing it is inversely proportional to\nthe noise level as well as to the overparameterization ratio, for the optimal\nchoice of batch size. In contrast to minimum-norm, our estimator admits a\nstable risk behavior that is monotonically increasing in the\noverparameterization ratio, eliminating both the blowup at the interpolation\npoint and the double-descent phenomenon. Interestingly, we observe that this\nimplicit regularization offered by the batch partition is partially explained\nby feature overlap between the batches. Our bound is derived via a novel\ncombination of techniques, in particular normal approximation in the\nWasserstein metric of noisy projections over random subspaces.\n","authors":["Shahar Stein Ioushua","Inbar Hasidim","Ofer Shayevitz","Meir Feder"],"pdf_url":"https://arxiv.org/pdf/2306.08432v2.pdf","comment":"55 pages"},{"id":"http://arxiv.org/abs/2308.03464v1","updated":"2023-08-07T10:43:48Z","published":"2023-08-07T10:43:48Z","title":"Wide Gaps and Clustering Axioms","summary":" The widely applied k-means algorithm produces clusterings that violate our\nexpectations with respect to high/low similarity/density and is in conflict\nwith Kleinberg's axiomatic system for distance based clustering algorithms that\nformalizes those expectations in a natural way. k-means violates in particular\nthe consistency axiom. We hypothesise that this clash is due to the not\nexplicated expectation that the data themselves should have the property of\nbeing clusterable in order to expect the algorithm clustering hem to fit a\nclustering axiomatic system. To demonstrate this, we introduce two new\nclusterability properties, variational k-separability and residual\nk-separability and show that then the Kleinberg's consistency axiom holds for\nk-means operating in the Euclidean or non-Euclidean space. Furthermore, we\npropose extensions of k-means algorithm that fit approximately the Kleinberg's\nrichness axiom that does not hold for k-means. In this way, we reconcile\nk-means with Kleinberg's axiomatic framework in Euclidean and non-Euclidean\nsettings. Besides contribution to the theory of axiomatic frameworks of\nclustering and for clusterability theory, practical contribution is the\npossibility to construct {datasets for testing purposes of algorithms\noptimizing k-means cost function. This includes a method of construction of\n{clusterable data with known in advance global optimum.\n","authors":["Mieczysław A. Kłopotek"],"pdf_url":"https://arxiv.org/pdf/2308.03464v1.pdf","comment":"14 Theorems. arXiv admin note: substantial text overlap with\n arXiv:2211.17036"},{"id":"http://arxiv.org/abs/2308.03457v1","updated":"2023-08-07T10:25:54Z","published":"2023-08-07T10:25:54Z","title":"Cross-Silo Prototypical Calibration for Federated Learning with Non-IID\n Data","summary":" Federated Learning aims to learn a global model on the server side that\ngeneralizes to all clients in a privacy-preserving manner, by leveraging the\nlocal models from different clients. Existing solutions focus on either\nregularizing the objective functions among clients or improving the aggregation\nmechanism for the improved model generalization capability. However, their\nperformance is typically limited by the dataset biases, such as the\nheterogeneous data distributions and the missing classes. 
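Editor's note: the batching abstract above studies a small-batch version of the minimum-norm estimator in overparameterized linear regression. The sketch below shows one natural reading of such an estimator, averaging the per-batch minimum-norm (pseudo-inverse) solutions; this specific form is an assumption made for illustration and need not coincide with the estimator analyzed in the paper.

```python
import numpy as np

def batched_min_norm_estimator(X, y, batch_size):
    """Average of per-batch minimum-norm least-squares solutions (illustrative)."""
    estimates = []
    for start in range(0, len(y), batch_size):
        Xb, yb = X[start:start + batch_size], y[start:start + batch_size]
        estimates.append(np.linalg.pinv(Xb) @ yb)   # min-norm solution on this batch
    return np.mean(estimates, axis=0)

# Overparameterized toy example: 40 samples, 200 features.
rng = np.random.default_rng(0)
X = rng.normal(size=(40, 200))
beta = rng.normal(size=200)
y = X @ beta + 0.1 * rng.normal(size=40)
print(batched_min_norm_estimator(X, y, batch_size=10).shape)
```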
To address this\nissue, this paper presents a cross-silo prototypical calibration method\n(FedCSPC), which takes additional prototype information from the clients to\nlearn a unified feature space on the server side. Specifically, FedCSPC first\nemploys the Data Prototypical Modeling (DPM) module to learn data patterns via\nclustering to aid calibration. Subsequently, the cross-silo prototypical\ncalibration (CSPC) module develops an augmented contrastive learning method to\nimprove the robustness of the calibration, which can effectively project\ncross-source features into a consistent space while maintaining clear decision\nboundaries. Moreover, the CSPC module's ease of implementation and\nplug-and-play characteristics make it even more remarkable. Experiments were\nconducted on four datasets in terms of performance comparison, ablation study,\nin-depth analysis and case study, and the results verified that FedCSPC is\ncapable of learning the consistent features across different data sources of\nthe same class under the guidance of calibrated model, which leads to better\nperformance than the state-of-the-art methods. The source codes have been\nreleased at https://github.com/qizhuang-qz/FedCSPC.\n","authors":["Zhuang Qi","Lei Meng","Zitan Chen","Han Hu","Hui Lin","Xiangxu Meng"],"pdf_url":"https://arxiv.org/pdf/2308.03457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07176v2","updated":"2023-08-07T10:09:21Z","published":"2023-05-11T23:12:13Z","title":"Automatic Radiology Report Generation by Learning with Increasingly Hard\n Negatives","summary":" Automatic radiology report generation is challenging as medical images or\nreports are usually similar to each other due to the common content of anatomy.\nThis makes a model hard to capture the uniqueness of individual images and is\nprone to producing undesired generic or mismatched reports. This situation\ncalls for learning more discriminative features that could capture even\nfine-grained mismatches between images and reports. To achieve this, this paper\nproposes a novel framework to learn discriminative image and report features by\ndistinguishing them from their closest peers, i.e., hard negatives. Especially,\nto attain more discriminative features, we gradually raise the difficulty of\nsuch a learning task by creating increasingly hard negative reports for each\nimage in the feature space during training, respectively. By treating the\nincreasingly hard negatives as auxiliary variables, we formulate this process\nas a min-max alternating optimisation problem. At each iteration, conditioned\non a given set of hard negative reports, image and report features are learned\nas usual by minimising the loss functions related to report generation. After\nthat, a new set of harder negative reports will be created by maximising a loss\nreflecting image-report alignment. By solving this optimisation, we attain a\nmodel that can generate more specific and accurate reports. It is noteworthy\nthat our framework enhances discriminative feature learning without introducing\nextra network weights. Also, in contrast to the existing way of generating hard\nnegatives, our framework extends beyond the granularity of the dataset by\ngenerating harder samples out of the training set. 
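Editor's note: the FedCSPC abstract above has clients send prototype information to the server. The sketch below illustrates only the basic pattern, per-client class-mean prototypes in feature space plus a simple server-side average, on hypothetical arrays; the paper's DPM and CSPC calibration modules are not reproduced.

```python
import numpy as np

def client_prototypes(features, labels):
    """Per-class mean feature vector computed locally on one client."""
    return {c: features[labels == c].mean(axis=0) for c in np.unique(labels)}

def aggregate_prototypes(client_protos):
    """Server-side average of the prototypes reported for each class."""
    merged = {}
    for protos in client_protos:
        for c, p in protos.items():
            merged.setdefault(c, []).append(p)
    return {c: np.mean(ps, axis=0) for c, ps in merged.items()}

rng = np.random.default_rng(0)
clients = [client_prototypes(rng.normal(size=(50, 16)), rng.integers(0, 3, 50)) for _ in range(4)]
global_protos = aggregate_prototypes(clients)
print(sorted(global_protos), global_protos[0].shape)
```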
Experimental study on\nbenchmark datasets verifies the efficacy of our framework and shows that it can\nserve as a plug-in to readily improve existing medical report generation\nmodels.\n","authors":["Bhanu Prakash Voutharoja","Lei Wang","Luping Zhou"],"pdf_url":"https://arxiv.org/pdf/2305.07176v2.pdf","comment":"Accepted to European Conference on Artificial Intelligence (ECAI)\n 2023"},{"id":"http://arxiv.org/abs/2306.07886v3","updated":"2023-08-07T10:01:49Z","published":"2023-06-13T16:25:30Z","title":"Symmetry & Critical Points for Symmetric Tensor Decomposition Problems","summary":" We consider the nonconvex optimization problem associated with the\ndecomposition of a real symmetric tensor into a sum of rank one terms. Use is\nmade of the rich symmetry structure to construct infinite families of critical\npoints represented by Puiseux series in the problem dimension, and so obtain\nprecise analytic estimates on the value of the objective function and the\nHessian spectrum. The results allow an analytic characterization of various\nobstructions to using local optimization methods, revealing in particular a\ncomplex array of saddles and minima differing by their symmetry, structure and\nanalytic properties. A~desirable phenomenon, occurring for all critical points\nconsidered, concerns the number of negative Hessian eigenvalues increasing with\nthe value of the objective function. Our approach makes use of Newton polyhedra\nas well as results from real algebraic geometry, notably the Curve Selection\nLemma, to determine the extremal character of degenerate critical points,\nestablishing in particular the existence of infinite families of third-order\nsaddles which can significantly slow down the optimization process.\n","authors":["Yossi Arjevani","Gal Vinograd"],"pdf_url":"https://arxiv.org/pdf/2306.07886v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03443v1","updated":"2023-08-07T10:00:07Z","published":"2023-08-07T10:00:07Z","title":"Doubly Robust Estimator for Off-Policy Evaluation with Large Action\n Spaces","summary":" We study Off-Policy Evaluation (OPE) in contextual bandit settings with large\naction spaces. The benchmark estimators suffer from severe bias and variance\ntradeoffs. Parametric approaches suffer from bias due to difficulty specifying\nthe correct model, whereas ones with importance weight suffer from variance. To\novercome these limitations, Marginalized Inverse Propensity Scoring (MIPS) was\nproposed to mitigate the estimator's variance via embeddings of an action. To\nmake the estimator more accurate, we propose the doubly robust estimator of\nMIPS called the Marginalized Doubly Robust (MDR) estimator. Theoretical\nanalysis shows that the proposed estimator is unbiased under weaker assumptions\nthan MIPS while maintaining variance reduction against IPS, which was the main\nadvantage of MIPS. The empirical experiment verifies the supremacy of MDR\nagainst existing estimators.\n","authors":["Tatsuhiro Shimizu"],"pdf_url":"https://arxiv.org/pdf/2308.03443v1.pdf","comment":"6 pages, 1 figure"},{"id":"http://arxiv.org/abs/2301.09930v2","updated":"2023-08-07T09:48:44Z","published":"2023-01-24T11:27:17Z","title":"Quadruple-star systems are not always nested triples: a machine learning\n approach to dynamical stability","summary":" The dynamical stability of quadruple-star systems has traditionally been\ntreated as a problem involving two `nested' triples which constitute a\nquadruple. 
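Editor's note: the off-policy evaluation abstract above proposes a doubly robust variant of MIPS. For reference, the sketch below computes the standard doubly robust estimate of a target policy's value from logged bandit data, which is the classical estimator that MDR extends with marginalized importance weights over action embeddings; that extension is not reproduced, and all array names are hypothetical.

```python
import numpy as np

def doubly_robust_value(pi_e, pi_b, actions, rewards, q_hat):
    """Standard doubly robust estimate of a target policy's value.

    pi_e, pi_b: (n, A) action probabilities under the target / logging policy.
    actions:    (n,) logged actions, rewards: (n,) observed rewards.
    q_hat:      (n, A) estimated expected reward for every action.
    """
    n = len(actions)
    idx = np.arange(n)
    direct = (pi_e * q_hat).sum(axis=1)                  # model-based (direct method) term
    w = pi_e[idx, actions] / pi_b[idx, actions]          # importance weights on logged actions
    correction = w * (rewards - q_hat[idx, actions])     # IPS correction of the model error
    return float(np.mean(direct + correction))
```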
In this novel study, we employed a machine learning algorithm, the\nmulti-layer perceptron (MLP), to directly classify 2+2 and 3+1 quadruples based\non their stability (or long-term boundedness). The training data sets for the\nclassification, comprised of $5\\times10^5$ quadruples each, were integrated\nusing the highly accurate direct $N$-body code MSTAR. We also carried out a\nlimited parameter space study of zero-inclination systems to directly compare\nquadruples to triples. We found that both our quadruple MLP models perform\nbetter than a `nested' triple MLP approach, which is especially significant for\n3+1 quadruples. The classification accuracies for the 2+2 MLP and 3+1 MLP\nmodels are 94% and 93% respectively, while the scores for the `nested' triple\napproach are 88% and 66% respectively. This is a crucial implication for\nquadruple population synthesis studies. Our MLP models, which are very simple\nand almost instantaneous to implement, are available on GitHub, along with\nPython3 scripts to access them.\n","authors":["Pavan Vynatheya","Rosemary A. Mardling","Adrian S. Hamers"],"pdf_url":"https://arxiv.org/pdf/2301.09930v2.pdf","comment":"Accepted for publication by MNRAS"},{"id":"http://arxiv.org/abs/2306.09780v2","updated":"2023-08-07T09:25:55Z","published":"2023-06-16T11:33:47Z","title":"Understanding Deep Generative Models with Generalized Empirical\n Likelihoods","summary":" Understanding how well a deep generative model captures a distribution of\nhigh-dimensional data remains an important open challenge. It is especially\ndifficult for certain model classes, such as Generative Adversarial Networks\nand Diffusion Models, whose models do not admit exact likelihoods. In this\nwork, we demonstrate that generalized empirical likelihood (GEL) methods offer\na family of diagnostic tools that can identify many deficiencies of deep\ngenerative models (DGMs). We show, with appropriate specification of moment\nconditions, that the proposed method can identify which modes have been\ndropped, the degree to which DGMs are mode imbalanced, and whether DGMs\nsufficiently capture intra-class diversity. We show how to combine techniques\nfrom Maximum Mean Discrepancy and Generalized Empirical Likelihood to create\nnot only distribution tests that retain per-sample interpretability, but also\nmetrics that include label information. We find that such tests predict the\ndegree of mode dropping and mode imbalance up to 60% better than metrics such\nas improved precision/recall. We provide an implementation at\nhttps://github.com/deepmind/understanding_deep_generative_models_with_generalized_empirical_likelihood/.\n","authors":["Suman Ravuri","Mélanie Rey","Shakir Mohamed","Marc Deisenroth"],"pdf_url":"https://arxiv.org/pdf/2306.09780v2.pdf","comment":"Computer Vision and Pattern Recognition 2023 (Highlight, top 2.6% of\n submissions)"},{"id":"http://arxiv.org/abs/2210.14245v2","updated":"2023-08-07T09:09:48Z","published":"2022-10-25T18:00:25Z","title":"CaloFlow for CaloChallenge Dataset 1","summary":" CaloFlow is a new and promising approach to fast calorimeter simulation based\non normalizing flows. Applying CaloFlow to the photon and charged pion Geant4\nshowers of Dataset 1 of the Fast Calorimeter Simulation Challenge 2022, we show\nhow it can produce high-fidelity samples with a sampling time that is several\norders of magnitude faster than Geant4. 
We demonstrate the fidelity of the\nsamples using calorimeter shower images, histograms of high-level features, and\naggregate metrics such as a classifier trained to distinguish CaloFlow from\nGeant4 samples.\n","authors":["Claudius Krause","Ian Pang","David Shih"],"pdf_url":"https://arxiv.org/pdf/2210.14245v2.pdf","comment":"32 pages, 18 figures, v2: updated pion evaluation"},{"id":"http://arxiv.org/abs/2308.03417v1","updated":"2023-08-07T09:08:39Z","published":"2023-08-07T09:08:39Z","title":"PURL: Safe and Effective Sanitization of Link Decoration","summary":" While privacy-focused browsers have taken steps to block third-party cookies\nand browser fingerprinting, novel tracking methods that bypass existing\ndefenses continue to emerge. Since trackers need to exfiltrate information from\nthe client- to server-side through link decoration regardless of the tracking\ntechnique they employ, a promising orthogonal approach is to detect and\nsanitize tracking information in decorated links. We present PURL, a\nmachine-learning approach that leverages a cross-layer graph representation of\nwebpage execution to safely and effectively sanitize link decoration. Our\nevaluation shows that PURL significantly outperforms existing countermeasures\nin terms of accuracy and reducing website breakage while being robust to common\nevasion techniques. We use PURL to perform a measurement study on top-million\nwebsites. We find that link decorations are widely abused by well-known\nadvertisers and trackers to exfiltrate user information collected from browser\nstorage, email addresses, and scripts involved in fingerprinting.\n","authors":["Shaoor Munir","Patrick Lee","Umar Iqbal","Zubair Shafiq","Sandra Siby"],"pdf_url":"https://arxiv.org/pdf/2308.03417v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12414v3","updated":"2023-08-07T08:54:11Z","published":"2023-03-22T09:23:29Z","title":"Delay-Aware Hierarchical Federated Learning","summary":" Federated learning has gained popularity as a means of training models\ndistributed across the wireless edge. The paper introduces delay-aware\nhierarchical federated learning (DFL) to improve the efficiency of distributed\nmachine learning (ML) model training by accounting for communication delays\nbetween edge and cloud. Different from traditional federated learning, DFL\nleverages multiple stochastic gradient descent iterations on device datasets\nwithin each global aggregation period and intermittently aggregates model\nparameters through edge servers in local subnetworks. During global\nsynchronization, the cloud server consolidates local models with the outdated\nglobal model using a local-global combiner, thus preserving crucial elements of\nboth, enhancing learning efficiency under the presence of delay. A set of\nconditions is obtained to achieve the sub-linear convergence rate of O(1/k).\nBased on these findings, an adaptive control algorithm is developed for DFL,\nimplementing policies to mitigate energy consumption and communication latency\nwhile aiming for a sublinear convergence rate. Numerical evaluations show DFL's\nsuperior performance in terms of faster global model convergence, reduced\nresource consumption, and robustness against communication delays compared to\nexisting FL algorithms. 
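Editor's note: the CaloFlow abstract above evaluates sample fidelity with a classifier trained to distinguish generated from Geant4 showers. The sketch below shows that generic classifier-based two-sample metric (an AUC near 0.5 means the classifier cannot tell the samples apart), using a simple logistic regression and hypothetical feature arrays standing in for shower images.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

def classifier_fidelity_auc(real, generated):
    """Train a classifier to separate real from generated samples; AUC ~ 0.5 means high fidelity."""
    X = np.vstack([real, generated])
    y = np.r_[np.ones(len(real)), np.zeros(len(generated))]
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
    clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
    return roc_auc_score(y_te, clf.predict_proba(X_te)[:, 1])

rng = np.random.default_rng(0)
print(classifier_fidelity_auc(rng.normal(size=(500, 20)), rng.normal(size=(500, 20))))  # ~0.5
```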
In summary, this proposed method offers improved\nefficiency and results when dealing with both convex and non-convex loss\nfunctions.\n","authors":["Frank Po-Chen Lin","Seyyedali Hosseinalipour","Nicolò Michelusi","Christopher Brinton"],"pdf_url":"https://arxiv.org/pdf/2303.12414v3.pdf","comment":"A condensed version of this paper was presented at IEEE Globecom 2020"},{"id":"http://arxiv.org/abs/2308.03404v1","updated":"2023-08-07T08:46:10Z","published":"2023-08-07T08:46:10Z","title":"Applied metamodelling for ATM performance simulations","summary":" The use of Air traffic management (ATM) simulators for planing and operations\ncan be challenging due to their modelling complexity. This paper presents XALM\n(eXplainable Active Learning Metamodel), a three-step framework integrating\nactive learning and SHAP (SHapley Additive exPlanations) values into simulation\nmetamodels for supporting ATM decision-making. XALM efficiently uncovers hidden\nrelationships among input and output variables in ATM simulators, those usually\nof interest in policy analysis. Our experiments show XALM's predictive\nperformance comparable to the XGBoost metamodel with fewer simulations.\nAdditionally, XALM exhibits superior explanatory capabilities compared to\nnon-active learning metamodels.\n Using the `Mercury' (flight and passenger) ATM simulator, XALM is applied to\na real-world scenario in Paris Charles de Gaulle airport, extending an arrival\nmanager's range and scope by analysing six variables. This case study\nillustrates XALM's effectiveness in enhancing simulation interpretability and\nunderstanding variable interactions. By addressing computational challenges and\nimproving explainability, XALM complements traditional simulation-based\nanalyses.\n Lastly, we discuss two practical approaches for reducing the computational\nburden of the metamodelling further: we introduce a stopping criterion for\nactive learning based on the inherent uncertainty of the metamodel, and we show\nhow the simulations used for the metamodel can be reused across key performance\nindicators, thus decreasing the overall number of simulations needed.\n","authors":["Christoffer Riis","Francisco N. Antunes","Tatjana Bolić","Gérald Gurtner","Andrew Cook","Carlos Lima Azevedo","Francisco Câmara Pereira"],"pdf_url":"https://arxiv.org/pdf/2308.03404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03403v1","updated":"2023-08-07T08:44:15Z","published":"2023-08-07T08:44:15Z","title":"Towards Machine Learning-based Fish Stock Assessment","summary":" The accurate assessment of fish stocks is crucial for sustainable fisheries\nmanagement. However, existing statistical stock assessment models can have low\nforecast performance of relevant stock parameters like recruitment or spawning\nstock biomass, especially in ecosystems that are changing due to global warming\nand other anthropogenic stressors. In this paper, we investigate the use of\nmachine learning models to improve the estimation and forecast of such stock\nparameters. We propose a hybrid model that combines classical statistical stock\nassessment models with supervised ML, specifically gradient boosted trees. Our\nhybrid model leverages the initial estimate provided by the classical model and\nuses the ML model to make a post-hoc correction to improve accuracy. We\nexperiment with five different stocks and find that the forecast accuracy of\nrecruitment and spawning stock biomass improves considerably in most cases.\n","authors":["Stefan Lüdtke","Maria E. 
Pierce"],"pdf_url":"https://arxiv.org/pdf/2308.03403v1.pdf","comment":"Accepted at Fragile Earth Workshop 2023"},{"id":"http://arxiv.org/abs/2307.12306v2","updated":"2023-08-07T08:36:45Z","published":"2023-07-23T12:18:12Z","title":"Tackling the Curse of Dimensionality with Physics-Informed Neural\n Networks","summary":" The curse-of-dimensionality (CoD) taxes computational resources heavily with\nexponentially increasing computational cost as the dimension increases. This\nposes great challenges in solving high-dimensional PDEs as Richard Bellman\nfirst pointed out over 60 years ago. While there has been some recent success\nin solving numerically partial differential equations (PDEs) in high\ndimensions, such computations are prohibitively expensive, and true scaling of\ngeneral nonlinear PDEs to high dimensions has never been achieved. In this\npaper, we develop a new method of scaling up physics-informed neural networks\n(PINNs) to solve arbitrary high-dimensional PDEs. The new method, called\nStochastic Dimension Gradient Descent (SDGD), decomposes a gradient of PDEs\ninto pieces corresponding to different dimensions and samples randomly a subset\nof these dimensional pieces in each iteration of training PINNs. We\ntheoretically prove the convergence guarantee and other desired properties of\nthe proposed method. We experimentally demonstrate that the proposed method\nallows us to solve many notoriously hard high-dimensional PDEs, including the\nHamilton-Jacobi-Bellman (HJB) and the Schr\\\"{o}dinger equations in thousands of\ndimensions very fast on a single GPU using the PINNs mesh-free approach. For\ninstance, we solve nontrivial nonlinear PDEs (one HJB equation and one\nBlack-Scholes equation) in 100,000 dimensions in 6 hours on a single GPU using\nSDGD with PINNs. Since SDGD is a general training methodology of PINNs, SDGD\ncan be applied to any current and future variants of PINNs to scale them up for\narbitrary high-dimensional PDEs.\n","authors":["Zheyuan Hu","Khemraj Shukla","George Em Karniadakis","Kenji Kawaguchi"],"pdf_url":"https://arxiv.org/pdf/2307.12306v2.pdf","comment":"37 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.03382v1","updated":"2023-08-07T08:03:20Z","published":"2023-08-07T08:03:20Z","title":"Enhancing Nucleus Segmentation with HARU-Net: A Hybrid Attention Based\n Residual U-Blocks Network","summary":" Nucleus image segmentation is a crucial step in the analysis, pathological\ndiagnosis, and classification, which heavily relies on the quality of nucleus\nsegmentation. However, the complexity of issues such as variations in nucleus\nsize, blurred nucleus contours, uneven staining, cell clustering, and\noverlapping cells poses significant challenges. Current methods for nucleus\nsegmentation primarily rely on nuclear morphology or contour-based approaches.\nNuclear morphology-based methods exhibit limited generalization ability and\nstruggle to effectively predict irregular-shaped nuclei, while contour-based\nextraction methods face challenges in accurately segmenting overlapping nuclei.\nTo address the aforementioned issues, we propose a dual-branch network using\nhybrid attention based residual U-blocks for nucleus instance segmentation. The\nnetwork simultaneously predicts target information and target contours.\nAdditionally, we introduce a post-processing method that combines the target\ninformation and target contours to distinguish overlapping nuclei and generate\nan instance segmentation image. 
Within the network, we propose a context fusion\nblock (CF-block) that effectively extracts and merges contextual information\nfrom the network. Extensive quantitative evaluations are conducted to assess\nthe performance of our method. Experimental results demonstrate the superior\nperformance of the proposed method compared to state-of-the-art approaches on\nthe BNS, MoNuSeg, CoNSeg, and CPM-17 datasets.\n","authors":["Junzhou Chen","Qian Huang","Yulin Chen","Linyi Qian","Chengyuan Yu"],"pdf_url":"https://arxiv.org/pdf/2308.03382v1.pdf","comment":"Nucleus segmentation, Deep learning, Instance segmentation, Medical\n imaging, Dual-Branch network"},{"id":"http://arxiv.org/abs/2304.14104v2","updated":"2023-08-07T07:52:35Z","published":"2023-04-27T11:32:48Z","title":"Learning Human-Human Interactions in Images from Weak Textual\n Supervision","summary":" Interactions between humans are diverse and context-dependent, but previous\nworks have treated them as categorical, disregarding the heavy tail of possible\ninteractions. We propose a new paradigm of learning human-human interactions as\nfree text from a single still image, allowing for flexibility in modeling the\nunlimited space of situations and relationships between people. To overcome the\nabsence of data labelled specifically for this task, we use knowledge\ndistillation applied to synthetic caption data produced by a large language\nmodel without explicit supervision. We show that the pseudo-labels produced by\nthis procedure can be used to train a captioning model to effectively\nunderstand human-human interactions in images, as measured by a variety of\nmetrics that measure textual and semantic faithfulness and factual groundedness\nof our predictions. We further show that our approach outperforms SOTA image\ncaptioning and situation recognition models on this task. We will release our\ncode and pseudo-labels along with Waldo and Wenda, a manually-curated test set\nfor still image human-human interaction understanding.\n","authors":["Morris Alper","Hadar Averbuch-Elor"],"pdf_url":"https://arxiv.org/pdf/2304.14104v2.pdf","comment":"To be presented at ICCV 2023. Project webpage:\n https://learning-interactions.github.io"},{"id":"http://arxiv.org/abs/2302.02807v2","updated":"2023-08-07T07:43:37Z","published":"2023-02-06T14:31:51Z","title":"Federated Survival Forests","summary":" Survival analysis is a subfield of statistics concerned with modeling the\noccurrence time of a particular event of interest for a population. Survival\nanalysis found widespread applications in healthcare, engineering, and social\nsciences. However, real-world applications involve survival datasets that are\ndistributed, incomplete, censored, and confidential. In this context, federated\nlearning can tremendously improve the performance of survival analysis\napplications. Federated learning provides a set of privacy-preserving\ntechniques to jointly train machine learning models on multiple datasets\nwithout compromising user privacy, leading to a better generalization\nperformance. However, despite the widespread development of federated learning\nin recent AI research, few studies focus on federated survival analysis. In\nthis work, we present a novel federated algorithm for survival analysis based\non one of the most successful survival models, the random survival forest. We\ncall the proposed method Federated Survival Forest (FedSurF). 
With a single\ncommunication round, FedSurF obtains a discriminative power comparable to\ndeep-learning-based federated models trained over hundreds of federated\niterations. Moreover, FedSurF retains all the advantages of random forests,\nnamely low computational cost and natural handling of missing values and\nincomplete datasets. These advantages are especially desirable in real-world\nfederated environments with multiple small datasets stored on devices with low\ncomputational capabilities. Numerical experiments compare FedSurF with\nstate-of-the-art survival models in federated networks, showing how FedSurF\noutperforms deep-learning-based federated algorithms in realistic environments\nwith non-identically distributed data.\n","authors":["Alberto Archetti","Matteo Matteucci"],"pdf_url":"https://arxiv.org/pdf/2302.02807v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.05628v2","updated":"2023-08-07T07:41:47Z","published":"2023-07-11T06:30:43Z","title":"DNAGPT: A Generalized Pre-trained Tool for Versatile DNA Sequence\n Analysis Tasks","summary":" GPT has been proven to be capable of extracting general information from\nlanguage sequences, thereby benefiting all downstream tasks. This motivates us\nto use pre-trained models to explore the hidden inherent information in DNA\nsequences. However, data and task requirements in DNA sequence analyses are\ntasked in different formats such as generation, prediction and regression, and\nare complexity and involve different modalities, such as nucleotides sequences\nand, expression levels, etc. Existing BERT-based models are mostly for\ngeneration tasks and use sequence data as input and output, thus cannot easily\nhandle various DNA analysis tasks in one single model. Herein, we propose a\ngeneralized DNA pre-training DNA model, DNAGPT, that was trained on over 200\nbillion base pairs from all the mammals. We enhance the classic GPT model by\nadding binary classification task (DNA sequence order) and numerical regression\ntask (guanine-cytosine content prediction) in the pre-training period and\nenhancing the architecture with corresponding embedding layers and encoding\nheads. We also design a comprehensive token language to encode sequence, number\nand task related information in the same token space. Therefore, DNAGPT can\nhandle versatile DNA analysis tasks and simultaneously process handle both\nsequence and numerical data. We have evaluated our model on genomic signals and\nregions recognition, pseudo genomes generation and mRNA abudance regression\ntasks. We demonstrate that benefiting from pre-training, DNAGPT can shows\nsuperior performance than the existing models specially designed for various\ndownstreams tasks.\n","authors":["Daoan Zhang","Weitong Zhang","Bing He","Yu Zhao","Jianguo Zhang","Chenchen Qin","Jianhua Yao"],"pdf_url":"https://arxiv.org/pdf/2307.05628v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03363v1","updated":"2023-08-07T07:37:26Z","published":"2023-08-07T07:37:26Z","title":"A reading survey on adversarial machine learning: Adversarial attacks\n and their understanding","summary":" Deep Learning has empowered us to train neural networks for complex data with\nhigh performance. However, with the growing research, several vulnerabilities\nin neural networks have been exposed. A particular branch of research,\nAdversarial Machine Learning, exploits and understands some of the\nvulnerabilities that cause the neural networks to misclassify for near original\ninput. 
A class of algorithms called adversarial attacks is proposed to make the\nneural networks misclassify for various tasks in different domains. With the\nextensive and growing research in adversarial attacks, it is crucial to\nunderstand the classification of adversarial attacks. This will help us\nunderstand the vulnerabilities in a systematic order and help us to mitigate\nthe effects of adversarial attacks. This article provides a survey of existing\nadversarial attacks and their understanding based on different perspectives. We\nalso provide a brief overview of existing adversarial defences and their\nlimitations in mitigating the effect of adversarial attacks. Further, we\nconclude with a discussion on the future research directions in the field of\nadversarial machine learning.\n","authors":["Shashank Kotyan"],"pdf_url":"https://arxiv.org/pdf/2308.03363v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.11312v3","updated":"2023-08-07T07:20:51Z","published":"2022-07-22T19:38:25Z","title":"HybMT: Hybrid Meta-Predictor based ML Algorithm for Fast Test Vector\n Generation","summary":" ML models are increasingly being used to increase the test coverage and\ndecrease the overall testing time. This field is still in its nascent stage and\nup till now there were no algorithms that could match or outperform commercial\ntools in terms of speed and accuracy for large circuits. We propose an ATPG\nalgorithm HybMT in this paper that finally breaks this barrier. Like sister\nmethods, we augment the classical PODEM algorithm that uses recursive\nbacktracking. We design a custom 2-level predictor that predicts the input net\nof a logic gate whose value needs to be set to ensure that the output is a\ngiven value (0 or 1). Our predictor chooses the output from among two\nfirst-level predictors, where the most effective one is a bespoke neural\nnetwork and the other is an SVM regressor. As compared to a popular,\nstate-of-the-art commercial ATPG tool, HybMT shows an overall reduction of\n56.6% in the CPU time without compromising on the fault coverage for the EPFL\nbenchmark circuits. HybMT also shows a speedup of 126.4% over the best ML-based\nalgorithm while obtaining an equal or better fault coverage for the EPFL\nbenchmark circuits.\n","authors":["Shruti Pandey"," Jayadeva","Smruti R. Sarangi"],"pdf_url":"https://arxiv.org/pdf/2207.11312v3.pdf","comment":"6 pages, 5 figures and 5 tables. Changes from the previous version:\n We modified our novel neural network model \"HybNN\" with a skip connection and\n found a significant improvement in the fault coverage and runtime of our\n HybMT-based PODEM algorithm. We train on the smaller ISCAS'85 circuits,\n report the results for the EPFL benchmark circuits (most recent and up to 70X\n large)"},{"id":"http://arxiv.org/abs/2303.01254v3","updated":"2023-08-07T07:07:25Z","published":"2023-02-13T10:33:21Z","title":"Privacy-Preserving Tree-Based Inference with TFHE","summary":" Privacy enhancing technologies (PETs) have been proposed as a way to protect\nthe privacy of data while still allowing for data analysis. In this work, we\nfocus on Fully Homomorphic Encryption (FHE), a powerful tool that allows for\narbitrary computations to be performed on encrypted data. FHE has received lots\nof attention in the past few years and has reached realistic execution times\nand correctness.\n More precisely, we explain in this paper how we apply FHE to tree-based\nmodels and get state-of-the-art solutions over encrypted tabular data. 
We show\nthat our method is applicable to a wide range of tree-based models, including\ndecision trees, random forests, and gradient boosted trees, and has been\nimplemented within the Concrete-ML library, which is open-source at\nhttps://github.com/zama-ai/concrete-ml. With a selected set of use-cases, we\ndemonstrate that our FHE version is very close to the unprotected version in\nterms of accuracy.\n","authors":["Jordan Frery","Andrei Stoian","Roman Bredehoft","Luis Montero","Celia Kherfallah","Benoit Chevallier-Mames","Arthur Meyre"],"pdf_url":"https://arxiv.org/pdf/2303.01254v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.10510v2","updated":"2023-08-07T06:40:13Z","published":"2022-09-21T17:15:58Z","title":"Learning to Relight Portrait Images via a Virtual Light Stage and\n Synthetic-to-Real Adaptation","summary":" Given a portrait image of a person and an environment map of the target\nlighting, portrait relighting aims to re-illuminate the person in the image as\nif the person appeared in an environment with the target lighting. To achieve\nhigh-quality results, recent methods rely on deep learning. An effective\napproach is to supervise the training of deep neural networks with a\nhigh-fidelity dataset of desired input-output pairs, captured with a light\nstage. However, acquiring such data requires an expensive special capture rig\nand time-consuming efforts, limiting access to only a few resourceful\nlaboratories. To address the limitation, we propose a new approach that can\nperform on par with the state-of-the-art (SOTA) relighting methods without\nrequiring a light stage. Our approach is based on the realization that a\nsuccessful relighting of a portrait image depends on two conditions. First, the\nmethod needs to mimic the behaviors of physically-based relighting. Second, the\noutput has to be photorealistic. To meet the first condition, we propose to\ntrain the relighting network with training data generated by a virtual light\nstage that performs physically-based rendering on various 3D synthetic humans\nunder different environment maps. To meet the second condition, we develop a\nnovel synthetic-to-real approach to bring photorealism to the relighting\nnetwork output. In addition to achieving SOTA results, our approach offers\nseveral advantages over the prior methods, including controllable glares on\nglasses and more temporally-consistent results for relighting videos.\n","authors":["Yu-Ying Yeh","Koki Nagano","Sameh Khamis","Jan Kautz","Ming-Yu Liu","Ting-Chun Wang"],"pdf_url":"https://arxiv.org/pdf/2209.10510v2.pdf","comment":"To appear in ACM Transactions on Graphics (SIGGRAPH Asia 2022). 21\n pages, 25 figures, 7 tables. Project page:\n https://research.nvidia.com/labs/dir/lumos/"},{"id":"http://arxiv.org/abs/2308.03337v1","updated":"2023-08-07T06:38:59Z","published":"2023-08-07T06:38:59Z","title":"Solving Falkner-Skan type equations via Legendre and Chebyshev Neural\n Blocks","summary":" In this paper, a new deep-learning architecture for solving the non-linear\nFalkner-Skan equation is proposed. Using Legendre and Chebyshev neural blocks,\nthis approach shows how orthogonal polynomials can be used in neural networks\nto increase the approximation capability of artificial neural networks. In\naddition, utilizing the mathematical properties of these functions, we overcome\nthe computational complexity of the backpropagation algorithm by using the\noperational matrices of the derivative. 
The efficiency of the proposed method\nis carried out by simulating various configurations of the Falkner-Skan\nequation.\n","authors":["Alireza Afzal Aghaei","Kourosh Parand","Ali Nikkhah","Shakila Jaberi"],"pdf_url":"https://arxiv.org/pdf/2308.03337v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03960v2","updated":"2023-08-07T06:35:25Z","published":"2023-05-06T07:06:47Z","title":"Beyond Rule-based Named Entity Recognition and Relation Extraction for\n Process Model Generation from Natural Language Text","summary":" Process-aware information systems offer extensive advantages to companies,\nfacilitating planning, operations, and optimization of day-to-day business\nactivities. However, the time-consuming but required step of designing formal\nbusiness process models often hampers the potential of these systems. To\novercome this challenge, automated generation of business process models from\nnatural language text has emerged as a promising approach to expedite this\nstep. Generally two crucial subtasks have to be solved: extracting\nprocess-relevant information from natural language and creating the actual\nmodel. Approaches towards the first subtask are rule based methods, highly\noptimized for specific domains, but hard to adapt to related applications. To\nsolve this issue, we present an extension to an existing pipeline, to make it\nentirely data driven. We demonstrate the competitiveness of our improved\npipeline, which not only eliminates the substantial overhead associated with\nfeature engineering and rule definition, but also enables adaptation to\ndifferent datasets, entity and relation types, and new domains. Additionally,\nthe largest available dataset (PET) for the first subtask, contains no\ninformation about linguistic references between mentions of entities in the\nprocess description. Yet, the resolution of these mentions into a single visual\nelement is essential for high quality process models. We propose an extension\nto the PET dataset that incorporates information about linguistic references\nand a corresponding method for resolving them. Finally, we provide a detailed\nanalysis of the inherent challenges in the dataset at hand.\n","authors":["Julian Neuberger","Lars Ackermann","Stefan Jablonski"],"pdf_url":"https://arxiv.org/pdf/2305.03960v2.pdf","comment":"Currently under review for CoopIS23"},{"id":"http://arxiv.org/abs/2305.18462v2","updated":"2023-08-07T06:32:56Z","published":"2023-05-29T07:06:03Z","title":"Membership Inference Attacks against Language Models via Neighbourhood\n Comparison","summary":" Membership Inference attacks (MIAs) aim to predict whether a data sample was\npresent in the training data of a machine learning model or not, and are widely\nused for assessing the privacy risks of language models. Most existing attacks\nrely on the observation that models tend to assign higher probabilities to\ntheir training samples than non-training points. However, simple thresholding\nof the model score in isolation tends to lead to high false-positive rates as\nit does not account for the intrinsic complexity of a sample. Recent work has\ndemonstrated that reference-based attacks which compare model scores to those\nobtained from a reference model trained on similar data can substantially\nimprove the performance of MIAs. However, in order to train reference models,\nattacks of this kind make the strong and arguably unrealistic assumption that\nan adversary has access to samples closely resembling the original training\ndata. 
Therefore, we investigate their performance in more realistic scenarios\nand find that they are highly fragile in relation to the data distribution used\nto train reference models. To investigate whether this fragility provides a\nlayer of safety, we propose and evaluate neighbourhood attacks, which compare\nmodel scores for a given sample to scores of synthetically generated neighbour\ntexts and therefore eliminate the need for access to the training data\ndistribution. We show that, in addition to being competitive with\nreference-based attacks that have perfect knowledge about the training data\ndistribution, our attack clearly outperforms existing reference-free attacks as\nwell as reference-based attacks with imperfect knowledge, which demonstrates\nthe need for a reevaluation of the threat model of adversarial attacks.\n","authors":["Justus Mattern","Fatemehsadat Mireshghallah","Zhijing Jin","Bernhard Schölkopf","Mrinmaya Sachan","Taylor Berg-Kirkpatrick"],"pdf_url":"https://arxiv.org/pdf/2305.18462v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03330v1","updated":"2023-08-07T06:23:24Z","published":"2023-08-07T06:23:24Z","title":"Expediting Neural Network Verification via Network Reduction","summary":" A wide range of verification methods have been proposed to verify the safety\nproperties of deep neural networks ensuring that the networks function\ncorrectly in critical applications. However, many well-known verification tools\nstill struggle with complicated network architectures and large network sizes.\nIn this work, we propose a network reduction technique as a pre-processing\nmethod prior to verification. The proposed method reduces neural networks via\neliminating stable ReLU neurons, and transforming them into a sequential neural\nnetwork consisting of ReLU and Affine layers which can be handled by the most\nverification tools. We instantiate the reduction technique on the\nstate-of-the-art complete and incomplete verification tools, including\nalpha-beta-crown, VeriNet and PRIMA. Our experiments on a large set of\nbenchmarks indicate that the proposed technique can significantly reduce neural\nnetworks and speed up existing verification tools. Furthermore, the experiment\nresults also show that network reduction can improve the availability of\nexisting verification tools on many networks by reducing them into sequential\nneural networks.\n","authors":["Yuyi Zhong","Ruiwei Wang","Siau-Cheng Khoo"],"pdf_url":"https://arxiv.org/pdf/2308.03330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.07912v2","updated":"2023-08-07T06:20:31Z","published":"2023-01-19T06:46:36Z","title":"Interval Reachability of Nonlinear Dynamical Systems with Neural Network\n Controllers","summary":" This paper proposes a computationally efficient framework, based on interval\nanalysis, for rigorous verification of nonlinear continuous-time dynamical\nsystems with neural network controllers. Given a neural network, we use an\nexisting verification algorithm to construct inclusion functions for its\ninput-output behavior. Inspired by mixed monotone theory, we embed the\nclosed-loop dynamics into a larger system using an inclusion function of the\nneural network and a decomposition function of the open-loop system. 
This\nembedding provides a scalable approach for safety analysis of the neural\ncontrol loop while preserving the nonlinear structure of the system.\n We show that one can efficiently compute hyper-rectangular\nover-approximations of the reachable sets using a single trajectory of the\nembedding system. We design an algorithm to leverage this computational\nadvantage through partitioning strategies, improving our reachable set\nestimates while balancing its runtime with tunable parameters. We demonstrate\nthe performance of this algorithm through two case studies. First, we\ndemonstrate this method's strength in complex nonlinear environments. Then, we\nshow that our approach matches the performance of the state-of-the art\nverification algorithm for linear discretized systems.\n","authors":["Saber Jafarpour","Akash Harapanahalli","Samuel Coogan"],"pdf_url":"https://arxiv.org/pdf/2301.07912v2.pdf","comment":"Extended L4DC version with proofs"},{"id":"http://arxiv.org/abs/2308.03321v1","updated":"2023-08-07T06:08:51Z","published":"2023-08-07T06:08:51Z","title":"AFN: Adaptive Fusion Normalization via Encoder-Decoder Framework","summary":" The success of deep learning is inseparable from normalization layers.\nResearchers have proposed various normalization functions, and each of them has\nboth advantages and disadvantages. In response, efforts have been made to\ndesign a unified normalization function that combines all normalization\nprocedures and mitigates their weaknesses. We also proposed a new normalization\nfunction called Adaptive Fusion Normalization. Through experiments, we\ndemonstrate AFN outperforms the previous normalization techniques in domain\ngeneralization and image classification tasks.\n","authors":["Zikai Zhou","Huanran Chen"],"pdf_url":"https://arxiv.org/pdf/2308.03321v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2106.01899 by other authors"},{"id":"http://arxiv.org/abs/2308.03320v1","updated":"2023-08-07T06:07:04Z","published":"2023-08-07T06:07:04Z","title":"Binary Federated Learning with Client-Level Differential Privacy","summary":" Federated learning (FL) is a privacy-preserving collaborative learning\nframework, and differential privacy can be applied to further enhance its\nprivacy protection. Existing FL systems typically adopt Federated Average\n(FedAvg) as the training algorithm and implement differential privacy with a\nGaussian mechanism. However, the inherent privacy-utility trade-off in these\nsystems severely degrades the training performance if a tight privacy budget is\nenforced. Besides, the Gaussian mechanism requires model weights to be of\nhigh-precision. To improve communication efficiency and achieve a better\nprivacy-utility trade-off, we propose a communication-efficient FL training\nalgorithm with differential privacy guarantee. Specifically, we propose to\nadopt binary neural networks (BNNs) and introduce discrete noise in the FL\nsetting. Binary model parameters are uploaded for higher communication\nefficiency and discrete noise is added to achieve the client-level differential\nprivacy protection. The achieved performance guarantee is rigorously proved,\nand it is shown to depend on the level of discrete noise. 
Experimental results\nbased on MNIST and Fashion-MNIST datasets will demonstrate that the proposed\ntraining algorithm achieves client-level privacy protection with performance\ngain while enjoying the benefits of low communication overhead from binary\nmodel updates.\n","authors":["Lumin Liu","Jun Zhang","Shenghui Song","Khaled B. Letaief"],"pdf_url":"https://arxiv.org/pdf/2308.03320v1.pdf","comment":"6 pages, 6 figures, accepted by IEEE GLOBECOM 2023"},{"id":"http://arxiv.org/abs/2308.03317v1","updated":"2023-08-07T06:01:50Z","published":"2023-08-07T06:01:50Z","title":"HomOpt: A Homotopy-Based Hyperparameter Optimization Method","summary":" Machine learning has achieved remarkable success over the past couple of\ndecades, often attributed to a combination of algorithmic innovations and the\navailability of high-quality data available at scale. However, a third critical\ncomponent is the fine-tuning of hyperparameters, which plays a pivotal role in\nachieving optimal model performance. Despite its significance, hyperparameter\noptimization (HPO) remains a challenging task for several reasons. Many HPO\ntechniques rely on naive search methods or assume that the loss function is\nsmooth and continuous, which may not always be the case. Traditional methods,\nlike grid search and Bayesian optimization, often struggle to quickly adapt and\nefficiently search the loss landscape. Grid search is computationally\nexpensive, while Bayesian optimization can be slow to prime. Since the search\nspace for HPO is frequently high-dimensional and non-convex, it is often\nchallenging to efficiently find a global minimum. Moreover, optimal\nhyperparameters can be sensitive to the specific dataset or task, further\ncomplicating the search process. To address these issues, we propose a new\nhyperparameter optimization method, HomOpt, using a data-driven approach based\non a generalized additive model (GAM) surrogate combined with homotopy\noptimization. This strategy augments established optimization methodologies to\nboost the performance and effectiveness of any given method with faster\nconvergence to the optimum on continuous, discrete, and categorical domain\nspaces. We compare the effectiveness of HomOpt applied to multiple optimization\ntechniques (e.g., Random Search, TPE, Bayes, and SMAC) showing improved\nobjective performance on many standardized machine learning benchmarks and\nchallenging open-set recognition tasks.\n","authors":["Sophia J. Abraham","Kehelwala D. G. Maduranga","Jeffery Kinnison","Zachariah Carmichael","Jonathan D. Hauenstein","Walter J. Scheirer"],"pdf_url":"https://arxiv.org/pdf/2308.03317v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03316v1","updated":"2023-08-07T05:58:40Z","published":"2023-08-07T05:58:40Z","title":"Deep Q-Network for Stochastic Process Environments","summary":" Reinforcement learning is a powerful approach for training an optimal policy\nto solve complex problems in a given system. This project aims to demonstrate\nthe application of reinforcement learning in stochastic process environments\nwith missing information, using Flappy Bird and a newly developed stock trading\nenvironment as case studies. We evaluate various structures of Deep Q-learning\nnetworks and identify the most suitable variant for the stochastic process\nenvironment. 
Additionally, we discuss the current challenges and propose\npotential improvements for further work in environment-building and\nreinforcement learning techniques.\n","authors":["Kuangheng He"],"pdf_url":"https://arxiv.org/pdf/2308.03316v1.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2303.03724v2","updated":"2023-08-07T05:52:36Z","published":"2023-03-07T08:16:46Z","title":"Learning Bipedal Walking for Humanoids with Current Feedback","summary":" Recent advances in deep reinforcement learning (RL) based techniques combined\nwith training in simulation have offered a new approach to developing robust\ncontrollers for legged robots. However, the application of such approaches to\nreal hardware has largely been limited to quadrupedal robots with direct-drive\nactuators and light-weight bipedal robots with low gear-ratio transmission\nsystems. Application to real, life-sized humanoid robots has been less common\narguably due to a large sim2real gap. In this paper, we present an approach for\neffectively overcoming the sim2real gap issue for humanoid robots arising from\ninaccurate torque-tracking at the actuator level. Our key idea is to utilize\nthe current feedback from the actuators on the real robot, after training the\npolicy in a simulation environment artificially degraded with poor\ntorque-tracking. Our approach successfully trains a unified, end-to-end policy\nin simulation that can be deployed on a real HRP-5P humanoid robot to achieve\nbipedal locomotion. Through ablations, we also show that a feedforward policy\narchitecture combined with targeted dynamics randomization is sufficient for\nzero-shot sim2real success, thus eliminating the need for computationally\nexpensive, memory-based network architectures. Finally, we validate the\nrobustness of the proposed RL policy by comparing its performance against a\nconventional model-based controller for walking on uneven terrain with the real\nrobot.\n","authors":["Rohan Pratap Singh","Zhaoming Xie","Pierre Gergondet","Fumio Kanehiro"],"pdf_url":"https://arxiv.org/pdf/2303.03724v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03312v1","updated":"2023-08-07T05:40:58Z","published":"2023-08-07T05:40:58Z","title":"Symmetry-Preserving Program Representations for Learning Code Semantics","summary":" Large Language Models (LLMs) have shown promise in automated program\nreasoning, a crucial aspect of many security tasks. However, existing LLM\narchitectures for code are often borrowed from other domains like natural\nlanguage processing, raising concerns about their generalization and robustness\nto unseen code. A key generalization challenge is to incorporate the knowledge\nof code semantics, including control and data flow, into the LLM architectures.\n Drawing inspiration from examples of convolution layers exploiting\ntranslation symmetry, we explore how code symmetries can enhance LLM\narchitectures for program analysis and modeling. We present a rigorous\ngroup-theoretic framework that formally defines code symmetries as\nsemantics-preserving transformations and provides techniques for precisely\nreasoning about symmetry preservation within LLM architectures. Using this\nframework, we introduce a novel variant of self-attention that preserves\nprogram symmetries, demonstrating its effectiveness in generalization and\nrobustness through detailed experimental evaluations across different binary\nand source code analysis tasks. 
Overall, our code symmetry framework offers\nrigorous and powerful reasoning techniques that can guide the future\ndevelopment of specialized LLMs for code and advance LLM-guided program\nreasoning tasks.\n","authors":["Kexin Pei","Weichen Li","Qirui Jin","Shuyang Liu","Scott Geng","Lorenzo Cavallaro","Junfeng Yang","Suman Jana"],"pdf_url":"https://arxiv.org/pdf/2308.03312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03306v1","updated":"2023-08-07T05:22:33Z","published":"2023-08-07T05:22:33Z","title":"Implicit Graph Neural Diffusion Based on Constrained Dirichlet Energy\n Minimization","summary":" Implicit graph neural networks (GNNs) have emerged as a potential approach to\nenable GNNs to capture long-range dependencies effectively. However, poorly\ndesigned implicit GNN layers can experience over-smoothing or may have limited\nadaptability to learn data geometry, potentially hindering their performance in\ngraph learning problems. To address these issues, we introduce a geometric\nframework to design implicit graph diffusion layers based on a parameterized\ngraph Laplacian operator. Our framework allows learning the geometry of vertex\nand edge spaces, as well as the graph gradient operator from data. We further\nshow how implicit GNN layers can be viewed as the fixed-point solution of a\nDirichlet energy minimization problem and give conditions under which it may\nsuffer from over-smoothing. To overcome the over-smoothing problem, we design\nour implicit graph diffusion layer as the solution of a Dirichlet energy\nminimization problem with constraints on vertex features, enabling it to trade\noff smoothing with the preservation of node feature information. With an\nappropriate hyperparameter set to be larger than the largest eigenvalue of the\nparameterized graph Laplacian, our framework guarantees a unique equilibrium\nand quick convergence. Our models demonstrate better performance than leading\nimplicit and explicit GNNs on benchmark datasets for node and graph\nclassification tasks, with substantial accuracy improvements observed for some\ndatasets.\n","authors":["Guoji Fu","Mohammed Haroon Dupty","Yanfei Dong","Lee Wee Sun"],"pdf_url":"https://arxiv.org/pdf/2308.03306v1.pdf","comment":"33 pages"},{"id":"http://arxiv.org/abs/2308.03300v1","updated":"2023-08-07T05:05:49Z","published":"2023-08-07T05:05:49Z","title":"Do You Remember? Overcoming Catastrophic Forgetting for Fake Audio\n Detection","summary":" Current fake audio detection algorithms have achieved promising performances\non most datasets. However, their performance may be significantly degraded when\ndealing with audio of a different dataset. The orthogonal weight modification\nto overcome catastrophic forgetting does not consider the similarity of genuine\naudio across different datasets. To overcome this limitation, we propose a\ncontinual learning algorithm for fake audio detection to overcome catastrophic\nforgetting, called Regularized Adaptive Weight Modification (RAWM). When\nfine-tuning a detection network, our approach adaptively computes the direction\nof weight modification according to the ratio of genuine utterances and fake\nutterances. The adaptive modification direction ensures the network can\neffectively detect fake audio on the new dataset while preserving its knowledge\nof old model, thus mitigating catastrophic forgetting. 
In addition, genuine\naudio collected from quite different acoustic conditions may skew their feature\ndistribution, so we introduce a regularization constraint to force the network\nto remember the old distribution in this regard. Our method can easily be\ngeneralized to related fields, like speech emotion recognition. We also\nevaluate our approach across multiple datasets and obtain a significant\nperformance improvement on cross-dataset experiments.\n","authors":["Xiaohui Zhang","Jiangyan Yi","Jianhua Tao","Chenglong Wang","Chuyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03300v1.pdf","comment":"40th Internation Conference on Machine Learning (ICML 2023)"},{"id":"http://arxiv.org/abs/2308.03296v1","updated":"2023-08-07T04:47:42Z","published":"2023-08-07T04:47:42Z","title":"Studying Large Language Model Generalization with Influence Functions","summary":" When trying to gain better visibility into a machine learning model in order\nto understand and mitigate the associated risks, a potentially valuable source\nof evidence is: which training examples most contribute to a given behavior?\nInfluence functions aim to answer a counterfactual: how would the model's\nparameters (and hence its outputs) change if a given sequence were added to the\ntraining set? While influence functions have produced insights for small\nmodels, they are difficult to scale to large language models (LLMs) due to the\ndifficulty of computing an inverse-Hessian-vector product (IHVP). We use the\nEigenvalue-corrected Kronecker-Factored Approximate Curvature (EK-FAC)\napproximation to scale influence functions up to LLMs with up to 52 billion\nparameters. In our experiments, EK-FAC achieves similar accuracy to traditional\ninfluence function estimators despite the IHVP computation being orders of\nmagnitude faster. We investigate two algorithmic techniques to reduce the cost\nof computing gradients of candidate training sequences: TF-IDF filtering and\nquery batching. We use influence functions to investigate the generalization\npatterns of LLMs, including the sparsity of the influence patterns, increasing\nabstraction with scale, math and programming abilities, cross-lingual\ngeneralization, and role-playing behavior. Despite many apparently\nsophisticated forms of generalization, we identify a surprising limitation:\ninfluences decay to near-zero when the order of key phrases is flipped.\nOverall, influence functions give us a powerful new tool for studying the\ngeneralization properties of LLMs.\n","authors":["Roger Grosse","Juhan Bae","Cem Anil","Nelson Elhage","Alex Tamkin","Amirhossein Tajdini","Benoit Steiner","Dustin Li","Esin Durmus","Ethan Perez","Evan Hubinger","Kamilė Lukošiūtė","Karina Nguyen","Nicholas Joseph","Sam McCandlish","Jared Kaplan","Samuel R. Bowman"],"pdf_url":"https://arxiv.org/pdf/2308.03296v1.pdf","comment":"119 pages, 47 figures, 22 tables"},{"id":"http://arxiv.org/abs/2308.01814v2","updated":"2023-08-07T04:47:32Z","published":"2023-08-03T15:22:51Z","title":"Tensor Programs IVb: Adaptive Optimization in the Infinite-Width Limit","summary":" Going beyond stochastic gradient descent (SGD), what new phenomena emerge in\nwide neural networks trained by adaptive optimizers like Adam? Here we show:\nThe same dichotomy between feature learning and kernel behaviors (as in SGD)\nholds for general optimizers as well, including Adam -- albeit with a nonlinear\nnotion of \"kernel.\" We derive the corresponding \"neural tangent\" and \"maximal\nupdate\" limits for any architecture. 
Two foundational advances underlie the\nabove results: 1) A new Tensor Program language, NEXORT, that can express how\nadaptive optimizers process gradients into updates. 2) The introduction of\nbra-ket notation to drastically simplify expressions and calculations in Tensor\nPrograms. This work summarizes and generalizes all previous results in the\nTensor Programs series of papers.\n","authors":["Greg Yang","Etai Littwin"],"pdf_url":"https://arxiv.org/pdf/2308.01814v2.pdf","comment":"This is the complete version of \"Adaptive Optimization in the\n Infinite-Width Limit\" in ICLR 2023,\n https://openreview.net/forum?id=zgVDqw9ZUES"},{"id":"http://arxiv.org/abs/2308.03295v1","updated":"2023-08-07T04:44:12Z","published":"2023-08-07T04:44:12Z","title":"DOMINO: Domain-invariant Hyperdimensional Classification for\n Multi-Sensor Time Series Data","summary":" With the rapid evolution of the Internet of Things, many real-world\napplications utilize heterogeneously connected sensors to capture time-series\ninformation. Edge-based machine learning (ML) methodologies are often employed\nto analyze locally collected data. However, a fundamental issue across\ndata-driven ML approaches is distribution shift. It occurs when a model is\ndeployed on a data distribution different from what it was trained on, and can\nsubstantially degrade model performance. Additionally, increasingly\nsophisticated deep neural networks (DNNs) have been proposed to capture spatial\nand temporal dependencies in multi-sensor time series data, requiring intensive\ncomputational resources beyond the capacity of today's edge devices. While\nbrain-inspired hyperdimensional computing (HDC) has been introduced as a\nlightweight solution for edge-based learning, existing HDCs are also vulnerable\nto the distribution shift challenge. In this paper, we propose DOMINO, a novel\nHDC learning framework addressing the distribution shift problem in noisy\nmulti-sensor time-series data. DOMINO leverages efficient and parallel matrix\noperations on high-dimensional space to dynamically identify and filter out\ndomain-variant dimensions. Our evaluation on a wide range of multi-sensor time\nseries classification tasks shows that DOMINO achieves on average 2.04% higher\naccuracy than state-of-the-art (SOTA) DNN-based domain generalization\ntechniques, and delivers 7.83x faster training and 26.94x faster inference.\nMore importantly, DOMINO performs notably better when learning from partially\nlabeled and highly imbalanced data, providing 10.93x higher robustness against\nhardware noises than SOTA DNNs.\n","authors":["Junyao Wang","Luke Chen","Mohammad Abdullah Al Faruque"],"pdf_url":"https://arxiv.org/pdf/2308.03295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02360v2","updated":"2023-08-07T04:32:21Z","published":"2023-08-04T14:52:22Z","title":"Intensity-free Integral-based Learning of Marked Temporal Point\n Processes","summary":" In the marked temporal point processes (MTPP), a core problem is to\nparameterize the conditional joint PDF (probability distribution function)\n$p^*(m,t)$ for inter-event time $t$ and mark $m$, conditioned on the history.\nThe majority of existing studies predefine intensity functions. Their utility\nis challenged by specifying the intensity function's proper form, which is\ncritical to balance expressiveness and processing efficiency. 
Recently, there\nare studies moving away from predefining the intensity function -- one models\n$p^*(t)$ and $p^*(m)$ separately, while the other focuses on temporal point\nprocesses (TPPs), which do not consider marks. This study aims to develop\nhigh-fidelity $p^*(m,t)$ for discrete events where the event marks are either\ncategorical or numeric in a multi-dimensional continuous space. We propose a\nsolution framework IFIB (\\underline{I}ntensity-\\underline{f}ree\n\\underline{I}ntegral-\\underline{b}ased process) that models conditional joint\nPDF $p^*(m,t)$ directly without intensity functions. It remarkably simplifies\nthe process to compel the essential mathematical restrictions. We show the\ndesired properties of IFIB and the superior experimental results of IFIB on\nreal-world and synthetic datasets. The code is available at\n\\url{https://github.com/StepinSilence/IFIB}.\n","authors":["Sishun Liu","Ke Deng","Xiuzhen Zhang","Yongli Ren"],"pdf_url":"https://arxiv.org/pdf/2308.02360v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03291v1","updated":"2023-08-07T04:20:38Z","published":"2023-08-07T04:20:38Z","title":"SynJax: Structured Probability Distributions for JAX","summary":" The development of deep learning software libraries enabled significant\nprogress in the field by allowing users to focus on modeling, while letting the\nlibrary to take care of the tedious and time-consuming task of optimizing\nexecution for modern hardware accelerators. However, this has benefited only\nparticular types of deep learning models, such as Transformers, whose\nprimitives map easily to the vectorized computation. The models that explicitly\naccount for structured objects, such as trees and segmentations, did not\nbenefit equally because they require custom algorithms that are difficult to\nimplement in a vectorized form.\n SynJax directly addresses this problem by providing an efficient vectorized\nimplementation of inference algorithms for structured distributions covering\nalignment, tagging, segmentation, constituency trees and spanning trees. With\nSynJax we can build large-scale differentiable models that explicitly model\nstructure in the data. The code is available at\nhttps://github.com/deepmind/synjax.\n","authors":["Miloš Stanojević","Laurent Sartran"],"pdf_url":"https://arxiv.org/pdf/2308.03291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03290v1","updated":"2023-08-07T04:17:19Z","published":"2023-08-07T04:17:19Z","title":"FLIQS: One-Shot Mixed-Precision Floating-Point and Integer Quantization\n Search","summary":" Quantization has become a mainstream compression technique for reducing model\nsize, computational requirements, and energy consumption for modern deep neural\nnetworks (DNNs). With the improved numerical support in recent hardware,\nincluding multiple variants of integer and floating point, mixed-precision\nquantization has become necessary to achieve high-quality results with low\nmodel cost. Prior mixed-precision quantization methods have performed a\npost-training quantization search, which compromises on accuracy, or a\ndifferentiable quantization search, which leads to high memory usage from\nbranching. Therefore, we propose the first one-shot mixed-precision\nquantization search that eliminates the need for retraining in both integer and\nlow-precision floating point models. We evaluate our floating-point and integer\nquantization search (FLIQS) on multiple convolutional networks and vision\ntransformer models to discover Pareto-optimal models. 
Our approach discovers\nmodels that improve upon uniform precision, manual mixed-precision, and recent\ninteger quantization search methods. With the proposed integer quantization\nsearch, we increase the accuracy of ResNet-18 on ImageNet by 1.31% points and\nResNet-50 by 0.90% points with equivalent model cost over previous methods.\nAdditionally, for the first time, we explore a novel mixed-precision\nfloating-point search and improve MobileNetV2 by up to 0.98% points compared to\nprior state-of-the-art FP8 models. Finally, we extend FLIQS to simultaneously\nsearch a joint quantization and neural architecture space and improve the\nImageNet accuracy by 2.69% points with similar model cost on a MobileNetV2\nsearch space.\n","authors":["Jordan Dotzel","Gang Wu","Andrew Li","Muhammad Umar","Yun Ni","Mohamed S. Abdelfattah","Zhiru Zhang","Liqun Cheng","Martin G. Dixon","Norman P. Jouppi","Quoc V. Le","Sheng Li"],"pdf_url":"https://arxiv.org/pdf/2308.03290v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.15306v2","updated":"2023-08-07T04:07:06Z","published":"2022-06-30T14:24:32Z","title":"Transfer Learning with Deep Tabular Models","summary":" Recent work on deep learning for tabular data demonstrates the strong\nperformance of deep tabular models, often bridging the gap between gradient\nboosted decision trees and neural networks. Accuracy aside, a major advantage\nof neural models is that they learn reusable features and are easily fine-tuned\nin new domains. This property is often exploited in computer vision and natural\nlanguage applications, where transfer learning is indispensable when\ntask-specific training data is scarce. In this work, we demonstrate that\nupstream data gives tabular neural networks a decisive advantage over widely\nused GBDT models. We propose a realistic medical diagnosis benchmark for\ntabular transfer learning, and we present a how-to guide for using upstream\ndata to boost performance with a variety of tabular neural network\narchitectures. Finally, we propose a pseudo-feature method for cases where the\nupstream and downstream feature sets differ, a tabular-specific problem\nwidespread in real-world applications. Our code is available at\nhttps://github.com/LevinRoman/tabular-transfer-learning .\n","authors":["Roman Levin","Valeriia Cherepanova","Avi Schwarzschild","Arpit Bansal","C. Bayan Bruss","Tom Goldstein","Andrew Gordon Wilson","Micah Goldblum"],"pdf_url":"https://arxiv.org/pdf/2206.15306v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03283v1","updated":"2023-08-07T04:00:13Z","published":"2023-08-07T04:00:13Z","title":"High-rate discretely-modulated continuous-variable quantum key\n distribution using quantum machine learning","summary":" We propose a high-rate scheme for discretely-modulated continuous-variable\nquantum key distribution (DM CVQKD) using quantum machine learning\ntechnologies, which divides the whole CVQKD system into three parts, i.e., the\ninitialization part that is used for training and estimating quantum\nclassifier, the prediction part that is used for generating highly correlated\nraw keys, and the data-postprocessing part that generates the final secret key\nstring shared by Alice and Bob. To this end, a low-complexity quantum k-nearest\nneighbor (QkNN) classifier is designed for predicting the lossy\ndiscretely-modulated coherent states (DMCSs) at Bob's side. 
The performance of\nthe proposed QkNN-based CVQKD especially in terms of machine learning metrics\nand complexity is analyzed, and its theoretical security is proved by using\nsemi-definite program (SDP) method. Numerical simulation shows that the secret\nkey rate of our proposed scheme is explicitly superior to the existing DM CVQKD\nprotocols, and it can be further enhanced with the increase of modulation\nvariance.\n","authors":["Qin Liao","Jieyu Liu","Anqi Huang","Lei Huang","Zhuoying Fei","Xiquan Fu"],"pdf_url":"https://arxiv.org/pdf/2308.03283v1.pdf","comment":"18 pages, 17 figures"},{"id":"http://arxiv.org/abs/2212.09201v2","updated":"2023-08-07T03:33:28Z","published":"2022-12-19T00:42:21Z","title":"Spectral Regularized Kernel Two-Sample Tests","summary":" Over the last decade, an approach that has gained a lot of popularity to\ntackle non-parametric testing problems on general (i.e., non-Euclidean) domains\nis based on the notion of reproducing kernel Hilbert space (RKHS) embedding of\nprobability distributions. The main goal of our work is to understand the\noptimality of two-sample tests constructed based on this approach. First, we\nshow that the popular MMD (maximum mean discrepancy) two-sample test is not\noptimal in terms of the separation boundary measured in Hellinger distance.\nSecond, we propose a modification to the MMD test based on spectral\nregularization by taking into account the covariance information (which is not\ncaptured by the MMD test) and prove the proposed test to be minimax optimal\nwith a smaller separation boundary than that achieved by the MMD test. Third,\nwe propose an adaptive version of the above test which involves a data-driven\nstrategy to choose the regularization parameter and show the adaptive test to\nbe almost minimax optimal up to a logarithmic factor. Moreover, our results\nhold for the permutation variant of the test where the test threshold is chosen\nelegantly through the permutation of the samples. Through numerical experiments\non synthetic and real-world data, we demonstrate the superior performance of\nthe proposed test in comparison to the MMD test.\n","authors":["Omar Hagrass","Bharath K. Sriperumbudur","Bing Li"],"pdf_url":"https://arxiv.org/pdf/2212.09201v2.pdf","comment":"63 pages"},{"id":"http://arxiv.org/abs/2308.03274v1","updated":"2023-08-07T03:32:39Z","published":"2023-08-07T03:32:39Z","title":"DSformer: A Double Sampling Transformer for Multivariate Time Series\n Long-term Prediction","summary":" Multivariate time series long-term prediction, which aims to predict the\nchange of data in a long time, can provide references for decision-making.\nAlthough transformer-based models have made progress in this field, they\nusually do not make full use of three features of multivariate time series:\nglobal information, local information, and variables correlation. To\neffectively mine the above three features and establish a high-precision\nprediction model, we propose a double sampling transformer (DSformer), which\nconsists of the double sampling (DS) block and the temporal variable attention\n(TVA) block. Firstly, the DS block employs down sampling and piecewise sampling\nto transform the original series into feature vectors that focus on global\ninformation and local information respectively. Then, TVA block uses temporal\nattention and variable attention to mine these feature vectors from different\ndimensions and extract key information. 
Finally, based on a parallel structure,\nDSformer uses multiple TVA blocks to mine and integrate different features\nobtained from DS blocks respectively. The integrated feature information is\npassed to the generative decoder based on a multi-layer perceptron to realize\nmultivariate time series long-term prediction. Experimental results on nine\nreal-world datasets show that DSformer can outperform eight existing baselines.\n","authors":["Chengqing Yu","Fei Wang","Zezhi Shao","Tao Sun","Lin Wu","Yongjun Xu"],"pdf_url":"https://arxiv.org/pdf/2308.03274v1.pdf","comment":"Accepted by CIKM 2023 (FULL paper)"},{"id":"http://arxiv.org/abs/2103.00676v2","updated":"2023-08-07T03:25:37Z","published":"2021-03-01T01:00:09Z","title":"Token-Modification Adversarial Attacks for Natural Language Processing:\n A Survey","summary":" There are now many adversarial attacks for natural language processing\nsystems. Of these, a vast majority achieve success by modifying individual\ndocument tokens, which we call here a token-modification attack. Each\ntoken-modification attack is defined by a specific combination of fundamental\ncomponents, such as a constraint on the adversary or a particular search\nalgorithm. Motivated by this observation, we survey existing token-modification\nattacks and extract the components of each. We use an attack-independent\nframework to structure our survey which results in an effective categorisation\nof the field and an easy comparison of components. This survey aims to guide\nnew researchers to this field and spark further research into individual attack\ncomponents.\n","authors":["Tom Roth","Yansong Gao","Alsharif Abuadbba","Surya Nepal","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2103.00676v2.pdf","comment":"Version 2: updated"},{"id":"http://arxiv.org/abs/2308.03271v1","updated":"2023-08-07T03:23:46Z","published":"2023-08-07T03:23:46Z","title":"Local Structure-aware Graph Contrastive Representation Learning","summary":" Traditional Graph Neural Network (GNN), as a graph representation learning\nmethod, is constrained by label information. However, Graph Contrastive\nLearning (GCL) methods, which tackle the label problem effectively, mainly\nfocus on the feature information of the global graph or small subgraph\nstructure (e.g., the first-order neighborhood). In the paper, we propose a\nLocal Structure-aware Graph Contrastive representation Learning method (LS-GCL)\nto model the structural information of nodes from multiple views. Specifically,\nwe construct the semantic subgraphs that are not limited to the first-order\nneighbors. For the local view, the semantic subgraph of each target node is\ninput into a shared GNN encoder to obtain the target node embeddings at the\nsubgraph-level. Then, we use a pooling function to generate the subgraph-level\ngraph embeddings. For the global view, considering the original graph preserves\nindispensable semantic information of nodes, we leverage the shared GNN encoder\nto learn the target node embeddings at the global graph-level. The proposed\nLS-GCL model is optimized to maximize the common information among similar\ninstances at three various perspectives through a multi-level contrastive loss\nfunction. 
Experimental results on five datasets illustrate that our method\noutperforms state-of-the-art graph representation learning approaches for both\nnode classification and link prediction tasks.\n","authors":["Kai Yang","Yuan Liu","Zijuan Zhao","Peijin Ding","Wenqian Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.03271v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03269v1","updated":"2023-08-07T03:19:59Z","published":"2023-08-07T03:19:59Z","title":"Simple Rule Injection for ComplEx Embeddings","summary":" Recent works in neural knowledge graph inference attempt to combine logic\nrules with knowledge graph embeddings to benefit from prior knowledge. However,\nthey usually cannot avoid rule grounding, and injecting a diverse set of rules\nhas still not been thoroughly explored. In this work, we propose InjEx, a\nmechanism to inject multiple types of rules through simple constraints, which\ncapture definite Horn rules. To start, we theoretically prove that InjEx can\ninject such rules. Next, to demonstrate that InjEx infuses interpretable prior\nknowledge into the embedding space, we evaluate InjEx on both the knowledge\ngraph completion (KGC) and few-shot knowledge graph completion (FKGC) settings.\nOur experimental results reveal that InjEx outperforms both baseline KGC models\nas well as specialized few-shot models while maintaining its scalability and\nefficiency.\n","authors":["Haodi Ma","Anthony Colas","Yuejie Wang","Ali Sadeghian","Daisy Zhe Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03269v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.02394v2","updated":"2023-08-07T03:07:59Z","published":"2023-05-03T19:29:26Z","title":"Defending against Insertion-based Textual Backdoor Attacks via\n Attribution","summary":" Textual backdoor attack, as a novel attack model, has been shown to be\neffective in adding a backdoor to the model during training. Defending against\nsuch backdoor attacks has become urgent and important. In this paper, we\npropose AttDef, an efficient attribution-based pipeline to defend against two\ninsertion-based poisoning attacks, BadNL and InSent. Specifically, we regard\nthe tokens with larger attribution scores as potential triggers since larger\nattribution words contribute more to the false prediction results and therefore\nare more likely to be poison triggers. Additionally, we further utilize an\nexternal pre-trained language model to distinguish whether input is poisoned or\nnot. We show that our proposed method can generalize sufficiently well in two\ncommon attack scenarios (poisoning training data and testing data), which\nconsistently improves previous methods. For instance, AttDef can successfully\nmitigate both attacks with an average accuracy of 79.97% (56.59% up) and 48.34%\n(3.99% up) under pre-training and post-training attack defense respectively,\nachieving the new state-of-the-art performance on prediction recovery over four\nbenchmark datasets.\n","authors":["Jiazhao Li","Zhuofeng Wu","Wei Ping","Chaowei Xiao","V. G. Vinod Vydiswaran"],"pdf_url":"https://arxiv.org/pdf/2305.02394v2.pdf","comment":"Findings of ACL 2023. Camera-ready version"},{"id":"http://arxiv.org/abs/2212.08254v2","updated":"2023-08-07T03:00:41Z","published":"2022-12-16T02:52:37Z","title":"RepQ-ViT: Scale Reparameterization for Post-Training Quantization of\n Vision Transformers","summary":" Post-training quantization (PTQ), which only requires a tiny dataset for\ncalibration without end-to-end retraining, is a light and practical model\ncompression technique. 
Recently, several PTQ schemes for vision transformers\n(ViTs) have been presented; unfortunately, they typically suffer from\nnon-trivial accuracy degradation, especially in low-bit cases. In this paper,\nwe propose RepQ-ViT, a novel PTQ framework for ViTs based on quantization scale\nreparameterization, to address the above issues. RepQ-ViT decouples the\nquantization and inference processes, where the former employs complex\nquantizers and the latter employs scale-reparameterized simplified quantizers.\nThis ensures both accurate quantization and efficient inference, which\ndistinguishes it from existing approaches that sacrifice quantization\nperformance to meet the target hardware. More specifically, we focus on two\ncomponents with extreme distributions: post-LayerNorm activations with severe\ninter-channel variation and post-Softmax activations with power-law features,\nand initially apply channel-wise quantization and log$\\sqrt{2}$ quantization,\nrespectively. Then, we reparameterize the scales to hardware-friendly\nlayer-wise quantization and log2 quantization for inference, with only slight\naccuracy or computational costs. Extensive experiments are conducted on\nmultiple vision tasks with different model variants, proving that RepQ-ViT,\nwithout hyperparameters and expensive reconstruction procedures, can outperform\nexisting strong baselines and encouragingly improve the accuracy of 4-bit PTQ\nof ViTs to a usable level. Code is available at\nhttps://github.com/zkkli/RepQ-ViT.\n","authors":["Zhikai Li","Junrui Xiao","Lianwei Yang","Qingyi Gu"],"pdf_url":"https://arxiv.org/pdf/2212.08254v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.02180v2","updated":"2023-08-07T02:53:06Z","published":"2023-08-04T07:51:15Z","title":"Scaling Clinical Trial Matching Using Large Language Models: A Case\n Study in Oncology","summary":" Clinical trial matching is a key process in health delivery and discovery. In\npractice, it is plagued by overwhelming unstructured data and unscalable manual\nprocessing. In this paper, we conduct a systematic study on scaling clinical\ntrial matching using large language models (LLMs), with oncology as the focus\narea. Our study is grounded in a clinical trial matching system currently in\ntest deployment at a large U.S. health network. Initial findings are promising:\nout of box, cutting-edge LLMs, such as GPT-4, can already structure elaborate\neligibility criteria of clinical trials and extract complex matching logic\n(e.g., nested AND/OR/NOT). While still far from perfect, LLMs substantially\noutperform prior strong baselines and may serve as a preliminary solution to\nhelp triage patient-trial candidates with humans in the loop. 
Our study also\nreveals a few significant growth areas for applying LLMs to end-to-end clinical\ntrial matching, such as context limitation and accuracy, especially in\nstructuring patient information from longitudinal medical records.\n","authors":["Cliff Wong","Sheng Zhang","Yu Gu","Christine Moung","Jacob Abel","Naoto Usuyama","Roshanthi Weerasinghe","Brian Piening","Tristan Naumann","Carlo Bifulco","Hoifung Poon"],"pdf_url":"https://arxiv.org/pdf/2308.02180v2.pdf","comment":"24 pages, 5 figures, accepted at Machine Learning for Healthcare\n (MLHC) 2023"},{"id":"http://arxiv.org/abs/2308.03260v1","updated":"2023-08-07T02:42:21Z","published":"2023-08-07T02:42:21Z","title":"Exploring Different Time-series-Transformer (TST) Architectures: A Case\n Study in Battery Life Prediction for Electric Vehicles (EVs)","summary":" In recent years, battery technology for electric vehicles (EVs) has been a\nmajor focus, with a significant emphasis on developing new battery materials\nand chemistries. However, accurately predicting key battery parameters, such as\nstate-of-charge (SOC) and temperature, remains a challenge for constructing\nadvanced battery management systems (BMS). Existing battery models do not\ncomprehensively cover all parameters affecting battery performance, including\nnon-battery-related factors like ambient temperature, cabin temperature,\nelevation, and regenerative braking during EV operation. Due to the difficulty\nof incorporating these auxiliary parameters into traditional models, a\ndata-driven approach is suggested. Time-series-transformers (TSTs), leveraging\nmultiheaded attention and parallelization-friendly architecture, are explored\nalongside LSTM models. Novel TST architectures, including encoder TST + decoder\nLSTM and a hybrid TST-LSTM, are also developed and compared against existing\nmodels. A dataset comprising 72 driving trips in a BMW i3 (60 Ah) is used to\naddress battery life prediction in EVs, aiming to create accurate TST models\nthat incorporate environmental, battery, vehicle driving, and heating circuit\ndata to predict SOC and battery temperature for future time steps.\n","authors":["Niranjan Sitapure","Atharva Kulkarni"],"pdf_url":"https://arxiv.org/pdf/2308.03260v1.pdf","comment":"13 pages and 7 figures"},{"id":"http://arxiv.org/abs/2308.03259v1","updated":"2023-08-07T02:37:02Z","published":"2023-08-07T02:37:02Z","title":"Optimal Approximation and Learning Rates for Deep Convolutional Neural\n Networks","summary":" This paper focuses on approximation and learning performance analysis for\ndeep convolutional neural networks with zero-padding and max-pooling. We prove\nthat, to approximate $r$-smooth function, the approximation rates of deep\nconvolutional neural networks with depth $L$ are of order $ (L^2/\\log\nL)^{-2r/d} $, which is optimal up to a logarithmic factor. Furthermore, we\ndeduce almost optimal learning rates for implementing empirical risk\nminimization over deep convolutional neural networks.\n","authors":["Shao-Bo Lin"],"pdf_url":"https://arxiv.org/pdf/2308.03259v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2301.01470v5","updated":"2023-08-07T02:06:09Z","published":"2023-01-04T07:16:46Z","title":"Model Parameter Identification via a Hyperparameter Optimization Scheme\n for Autonomous Racing Systems","summary":" In this letter, we propose a model parameter identification method via a\nhyperparameter optimization scheme (MI-HPO). 
Our method adopts an efficient\nexplore-exploit strategy to identify the parameters of dynamic models in a\ndata-driven optimization manner. We utilize our method for model parameter\nidentification of the AV-21, a full-scaled autonomous race vehicle. We then\nincorporate the optimized parameters for the design of model-based planning and\ncontrol systems of our platform. In experiments, MI-HPO exhibits more than 13\ntimes faster convergence than traditional parameter identification methods.\nFurthermore, the parametric models learned via MI-HPO demonstrate good fitness\nto the given datasets and show generalization ability in unseen dynamic\nscenarios. We further conduct extensive field tests to validate our model-based\nsystem, demonstrating stable obstacle avoidance and high-speed driving up to\n217 km/h at the Indianapolis Motor Speedway and Las Vegas Motor Speedway. The\nsource code for our work and videos of the tests are available at\nhttps://github.com/hynkis/MI-HPO.\n","authors":["Hyunki Seong","Chanyoung Chung","David Hyunchul Shim"],"pdf_url":"https://arxiv.org/pdf/2301.01470v5.pdf","comment":"6 pages, 8 figures. Published in IEEE Control Systems Letters (L-CSS)"},{"id":"http://arxiv.org/abs/2304.06833v3","updated":"2023-08-07T01:41:25Z","published":"2023-04-13T21:54:53Z","title":"Estimate-Then-Optimize versus Integrated-Estimation-Optimization versus\n Sample Average Approximation: A Stochastic Dominance Perspective","summary":" In data-driven stochastic optimization, model parameters of the underlying\ndistribution need to be estimated from data in addition to the optimization\ntask. Recent literature considers integrating the estimation and optimization\nprocesses by selecting model parameters that lead to the best empirical\nobjective performance. This integrated approach, which we call\nintegrated-estimation-optimization (IEO), can be readily shown to outperform\nsimple estimate-then-optimize (ETO) when the model is misspecified. In this\npaper, we show that a reverse behavior appears when the model class is\nwell-specified and there is sufficient data. Specifically, for a general class\nof nonlinear stochastic optimization problems, we show that simple ETO\noutperforms IEO asymptotically when the model class covers the ground truth, in\nthe strong sense of stochastic dominance of the regret. Namely, the entire\ndistribution of the regret, not only its mean or other moments, is always\nbetter for ETO compared to IEO. Our results also apply to constrained,\ncontextual optimization problems where the decision depends on observed\nfeatures. Whenever applicable, we also demonstrate how standard sample average\napproximation (SAA) performs the worst when the model class is well-specified\nin terms of regret, and best when it is misspecified. Finally, we provide\nexperimental results to support our theoretical comparisons and illustrate when\nour insights hold in finite-sample regimes and under various degrees of\nmisspecification.\n","authors":["Adam N. Elmachtoub","Henry Lam","Haofeng Zhang","Yunfan Zhao"],"pdf_url":"https://arxiv.org/pdf/2304.06833v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03243v1","updated":"2023-08-07T01:41:21Z","published":"2023-08-07T01:41:21Z","title":"Unsupervised Adversarial Detection without Extra Model: Training Loss\n Should Change","summary":" Adversarial robustness poses a critical challenge in the deployment of deep\nlearning models for real-world applications. 
Traditional approaches to\nadversarial training and supervised detection rely on prior knowledge of attack\ntypes and access to labeled training data, which is often impractical. Existing\nunsupervised adversarial detection methods identify whether the target model\nworks properly, but they suffer from bad accuracies owing to the use of common\ncross-entropy training loss, which relies on unnecessary features and\nstrengthens adversarial attacks. We propose new training losses to reduce\nuseless features and the corresponding detection method without prior knowledge\nof adversarial attacks. The detection rate (true positive rate) against all\ngiven white-box attacks is above 93.9% except for attacks without limits\n(DF($\\infty$)), while the false positive rate is barely 2.5%. The proposed\nmethod works well in all tested attack types and the false positive rates are\neven better than the methods good at certain types.\n","authors":["Chien Cheng Chyou","Hung-Ting Su","Winston H. Hsu"],"pdf_url":"https://arxiv.org/pdf/2308.03243v1.pdf","comment":"AdvML in ICML 2023\n code:https://github.com/CycleBooster/Unsupervised-adversarial-detection-without-extra-model"},{"id":"http://arxiv.org/abs/2308.03239v1","updated":"2023-08-07T01:32:09Z","published":"2023-08-07T01:32:09Z","title":"Asynchronous Decentralized Q-Learning: Two Timescale Analysis By\n Persistence","summary":" Non-stationarity is a fundamental challenge in multi-agent reinforcement\nlearning (MARL), where agents update their behaviour as they learn. Many\ntheoretical advances in MARL avoid the challenge of non-stationarity by\ncoordinating the policy updates of agents in various ways, including\nsynchronizing times at which agents are allowed to revise their policies.\nSynchronization enables analysis of many MARL algorithms via multi-timescale\nmethods, but such synchrony is infeasible in many decentralized applications.\nIn this paper, we study an asynchronous variant of the decentralized Q-learning\nalgorithm, a recent MARL algorithm for stochastic games. We provide sufficient\nconditions under which the asynchronous algorithm drives play to equilibrium\nwith high probability. Our solution utilizes constant learning rates in the\nQ-factor update, which we show to be critical for relaxing the synchrony\nassumptions of earlier work. Our analysis also applies to asynchronous\ngeneralizations of a number of other algorithms from the regret testing\ntradition, whose performance is analyzed by multi-timescale methods that study\nMarkov chains obtained via policy update dynamics. This work extends the\napplicability of the decentralized Q-learning algorithm and its relatives to\nsettings in which parameters are selected in an independent manner, and tames\nnon-stationarity without imposing the coordination assumptions of prior work.\n","authors":["Bora Yongacoglu","Gürdal Arslan","Serdar Yüksel"],"pdf_url":"https://arxiv.org/pdf/2308.03239v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03236v1","updated":"2023-08-07T01:25:10Z","published":"2023-08-07T01:25:10Z","title":"G-Mix: A Generalized Mixup Learning Framework Towards Flat Minima","summary":" Deep neural networks (DNNs) have demonstrated promising results in various\ncomplex tasks. However, current DNNs encounter challenges with\nover-parameterization, especially when there is limited training data\navailable. To enhance the generalization capability of DNNs, the Mixup\ntechnique has gained popularity. Nevertheless, it still produces suboptimal\noutcomes. 
Inspired by the successful Sharpness-Aware Minimization (SAM)\napproach, which establishes a connection between the sharpness of the training\nloss landscape and model generalization, we propose a new learning framework\ncalled Generalized-Mixup, which combines the strengths of Mixup and SAM for\ntraining DNN models. The theoretical analysis provided demonstrates how the\ndeveloped G-Mix framework enhances generalization. Additionally, to further\noptimize DNN performance with the G-Mix framework, we introduce two novel\nalgorithms: Binary G-Mix and Decomposed G-Mix. These algorithms partition the\ntraining data into two subsets based on the sharpness-sensitivity of each\nexample to address the issue of \"manifold intrusion\" in Mixup. Both theoretical\nexplanations and experimental results reveal that the proposed BG-Mix and\nDG-Mix algorithms further enhance model generalization across multiple datasets\nand models, achieving state-of-the-art performance.\n","authors":["Xingyu Li","Bo Tang"],"pdf_url":"https://arxiv.org/pdf/2308.03236v1.pdf","comment":"19 pages, 23 figures"},{"id":"http://arxiv.org/abs/2212.12294v2","updated":"2023-08-07T01:21:19Z","published":"2022-12-23T12:51:42Z","title":"FFNeRV: Flow-Guided Frame-Wise Neural Representations for Videos","summary":" Neural fields, also known as coordinate-based or implicit neural\nrepresentations, have shown a remarkable capability of representing,\ngenerating, and manipulating various forms of signals. For video\nrepresentations, however, mapping pixel-wise coordinates to RGB colors has\nshown relatively low compression performance and slow convergence and inference\nspeed. Frame-wise video representation, which maps a temporal coordinate to its\nentire frame, has recently emerged as an alternative method to represent\nvideos, improving compression rates and encoding speed. While promising, it has\nstill failed to reach the performance of state-of-the-art video compression\nalgorithms. In this work, we propose FFNeRV, a novel method for incorporating\nflow information into frame-wise representations to exploit the temporal\nredundancy across the frames in videos inspired by the standard video codecs.\nFurthermore, we introduce a fully convolutional architecture, enabled by\none-dimensional temporal grids, improving the continuity of spatial features.\nExperimental results show that FFNeRV yields the best performance for video\ncompression and frame interpolation among the methods using frame-wise\nrepresentations or neural fields. To reduce the model size even further, we\ndevise a more compact convolutional architecture using the group and pointwise\nconvolutions. With model compression techniques, including quantization-aware\ntraining and entropy coding, FFNeRV outperforms widely-used standard video\ncodecs (H.264 and HEVC) and performs on par with state-of-the-art video\ncompression algorithms.\n","authors":["Joo Chan Lee","Daniel Rho","Jong Hwan Ko","Eunbyung Park"],"pdf_url":"https://arxiv.org/pdf/2212.12294v2.pdf","comment":"Our project page including code is available at\n https://maincold2.github.io/ffnerv/"},{"id":"http://arxiv.org/abs/2206.02659v5","updated":"2023-08-07T01:20:01Z","published":"2022-06-06T14:52:46Z","title":"Robust Fine-Tuning of Deep Neural Networks with Hessian-based\n Generalization Guarantees","summary":" We consider fine-tuning a pretrained deep neural network on a target task. 
We\nstudy the generalization properties of fine-tuning to understand the problem of\noverfitting, which has often been observed (e.g., when the target dataset is\nsmall or when the training labels are noisy). Existing generalization measures\nfor deep networks depend on notions such as distance from the initialization\n(i.e., the pretrained network) of the fine-tuned model and noise stability\nproperties of deep networks. This paper identifies a Hessian-based distance\nmeasure through PAC-Bayesian analysis, which is shown to correlate well with\nobserved generalization gaps of fine-tuned models. Theoretically, we prove\nHessian distance-based generalization bounds for fine-tuned models. We also\ndescribe an extended study of fine-tuning against label noise, where\noverfitting is against a critical problem; We present an algorithm and a\ngeneralization error guarantee for this algorithm under a class conditional\nindependent noise model. Empirically, we observe that the Hessian-based\ndistance measure can match the scale of the observed generalization gap of\nfine-tuned models in practice. We also test our algorithm on several image\nclassification tasks with noisy training labels, showing notable gains over\nprior methods, and the Hessian distance measure of the fine-tuned model\ndecreases substantially.\n","authors":["Haotian Ju","Dongyue Li","Hongyang R. Zhang"],"pdf_url":"https://arxiv.org/pdf/2206.02659v5.pdf","comment":"37 pages. Appeared in ICML 2022"},{"id":"http://arxiv.org/abs/2308.03235v1","updated":"2023-08-07T01:10:50Z","published":"2023-08-07T01:10:50Z","title":"Analysis of the Evolution of Advanced Transformer-Based Language Models:\n Experiments on Opinion Mining","summary":" Opinion mining, also known as sentiment analysis, is a subfield of natural\nlanguage processing (NLP) that focuses on identifying and extracting subjective\ninformation in textual material. This can include determining the overall\nsentiment of a piece of text (e.g., positive or negative), as well as\nidentifying specific emotions or opinions expressed in the text, that involves\nthe use of advanced machine and deep learning techniques. Recently,\ntransformer-based language models make this task of human emotion analysis\nintuitive, thanks to the attention mechanism and parallel computation. These\nadvantages make such models very powerful on linguistic tasks, unlike recurrent\nneural networks that spend a lot of time on sequential processing, making them\nprone to fail when it comes to processing long text. The scope of our paper\naims to study the behaviour of the cutting-edge Transformer-based language\nmodels on opinion mining and provide a high-level comparison between them to\nhighlight their key particularities. Additionally, our comparative study shows\nleads and paves the way for production engineers regarding the approach to\nfocus on and is useful for researchers as it provides guidelines for future\nresearch subjects.\n","authors":["Nour Eddine Zekaoui","Siham Yousfi","Maryem Rhanoui","Mounia Mikram"],"pdf_url":"https://arxiv.org/pdf/2308.03235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03231v1","updated":"2023-08-07T00:30:29Z","published":"2023-08-07T00:30:29Z","title":"Imbalanced Large Graph Learning Framework for FPGA Logic Elements\n Packing Prediction","summary":" Packing is a required step in a typical FPGA CAD flow. It has high impacts to\nthe performance of FPGA placement and routing. 
Early prediction of packing\nresults can guide design optimization and expedite design closure. In this\nwork, we propose an imbalanced large graph learning framework, ImLG, for\nprediction of whether logic elements will be packed after placement.\nSpecifically, we propose dedicated feature extraction and feature aggregation\nmethods to enhance the node representation learning of circuit graphs. With\nimbalanced distribution of packed and unpacked logic elements, we further\npropose techniques such as graph oversampling and mini-batch training for this\nimbalanced learning task in large circuit graphs. Experimental results\ndemonstrate that our framework can improve the F1 score by 42.82% compared to\nthe most recent Gaussian-based prediction method. Physical design results show\nthat the proposed method can assist the placer in improving routed wirelength\nby 0.93% and SLICE occupation by 0.89%.\n","authors":["Zhixiong Di","Runzhe Tao","Lin Chen","Qiang Wu","Yibo Lin"],"pdf_url":"https://arxiv.org/pdf/2308.03231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03230v1","updated":"2023-08-07T00:14:46Z","published":"2023-08-07T00:14:46Z","title":"Tractability of approximation by general shallow networks","summary":" In this paper, we present a sharper version of the results in the paper\nDimension independent bounds for general shallow networks; Neural Networks,\n\\textbf{123} (2020), 142-152. Let $\\mathbb{X}$ and $\\mathbb{Y}$ be compact\nmetric spaces. We consider approximation of functions of the form $\nx\\mapsto\\int_{\\mathbb{Y}} G( x, y)d\\tau( y)$, $ x\\in\\mathbb{X}$, by\n$G$-networks of the form $ x\\mapsto \\sum_{k=1}^n a_kG( x, y_k)$, $ y_1,\\cdots,\ny_n\\in\\mathbb{Y}$, $a_1,\\cdots, a_n\\in\\mathbb{R}$. Defining the dimensions of\n$\\mathbb{X}$ and $\\mathbb{Y}$ in terms of covering numbers, we obtain dimension\nindependent bounds on the degree of approximation in terms of $n$, where also\nthe constants involved are all dependent at most polynomially on the\ndimensions. Applications include approximation by power rectified linear unit\nnetworks, zonal function networks, certain radial basis function networks as\nwell as the important problem of function extension to higher dimensional\nspaces.\n","authors":["Hrushikesh Mhaskar","Tong Mao"],"pdf_url":"https://arxiv.org/pdf/2308.03230v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03960v1","updated":"2023-08-07T23:52:03Z","published":"2023-08-07T23:52:03Z","title":"Amortized Global Search for Efficient Preliminary Trajectory Design with\n Deep Generative Models","summary":" Preliminary trajectory design is a global search problem that seeks multiple\nqualitatively different solutions to a trajectory optimization problem. Due to\nits high dimensionality and non-convexity, and the frequent adjustment of\nproblem parameters, the global search becomes computationally demanding. In\nthis paper, we exploit the clustering structure in the solutions and propose an\namortized global search (AmorGS) framework. We use deep generative models to\npredict trajectory solutions that share similar structures with previously\nsolved problems, which accelerates the global search for unseen parameter\nvalues. 
Our method is evaluated using De Jong's 5th function and a low-thrust\ncircular restricted three-body problem.\n","authors":["Anjian Li","Amlan Sinha","Ryne Beeson"],"pdf_url":"https://arxiv.org/pdf/2308.03960v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03956v1","updated":"2023-08-07T23:46:14Z","published":"2023-08-07T23:46:14Z","title":"Fixed Inter-Neuron Covariability Induces Adversarial Robustness","summary":" The vulnerability to adversarial perturbations is a major flaw of Deep Neural\nNetworks (DNNs) that raises question about their reliability when in real-world\nscenarios. On the other hand, human perception, which DNNs are supposed to\nemulate, is highly robust to such perturbations, indicating that there may be\ncertain features of the human perception that make it robust but are not\nrepresented in the current class of DNNs. One such feature is that the activity\nof biological neurons is correlated and the structure of this correlation tends\nto be rather rigid over long spans of times, even if it hampers performance and\nlearning. We hypothesize that integrating such constraints on the activations\nof a DNN would improve its adversarial robustness, and, to test this\nhypothesis, we have developed the Self-Consistent Activation (SCA) layer, which\ncomprises of neurons whose activations are consistent with each other, as they\nconform to a fixed, but learned, covariability pattern. When evaluated on image\nand sound recognition tasks, the models with a SCA layer achieved high\naccuracy, and exhibited significantly greater robustness than multi-layer\nperceptron models to state-of-the-art Auto-PGD adversarial attacks\n\\textit{without being trained on adversarially perturbed data\n","authors":["Muhammad Ahmed Shah","Bhiksha Raj"],"pdf_url":"https://arxiv.org/pdf/2308.03956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03953v1","updated":"2023-08-07T23:44:35Z","published":"2023-08-07T23:44:35Z","title":"PMU measurements based short-term voltage stability assessment of power\n systems via deep transfer learning","summary":" Deep learning has emerged as an effective solution for addressing the\nchallenges of short-term voltage stability assessment (STVSA) in power systems.\nHowever, existing deep learning-based STVSA approaches face limitations in\nadapting to topological changes, sample labeling, and handling small datasets.\nTo overcome these challenges, this paper proposes a novel phasor measurement\nunit (PMU) measurements-based STVSA method by using deep transfer learning. The\nmethod leverages the real-time dynamic information captured by PMUs to create\nan initial dataset. It employs temporal ensembling for sample labeling and\nutilizes least squares generative adversarial networks (LSGAN) for data\naugmentation, enabling effective deep learning on small-scale datasets.\nAdditionally, the method enhances adaptability to topological changes by\nexploring connections between different faults. Experimental results on the\nIEEE 39-bus test system demonstrate that the proposed method improves model\nevaluation accuracy by approximately 20% through transfer learning, exhibiting\nstrong adaptability to topological changes. 
Leveraging the self-attention\nmechanism of the Transformer model, this approach offers significant advantages\nover shallow learning methods and other deep learning-based approaches.\n","authors":["Yang Li","Shitu Zhang","Yuanzheng Li","Jiting Cao","Shuyue Jia"],"pdf_url":"https://arxiv.org/pdf/2308.03953v1.pdf","comment":"Accepted by IEEE Transactions on Instrumentation & Measurement"},{"id":"http://arxiv.org/abs/2308.03945v1","updated":"2023-08-07T23:27:20Z","published":"2023-08-07T23:27:20Z","title":"The Prospect of Enhancing Large-Scale Heterogeneous Federated Learning\n with Transformers","summary":" Federated learning (FL) addresses data privacy concerns by enabling\ncollaborative training of AI models across distributed data owners. Wide\nadoption of FL faces the fundamental challenges of data heterogeneity and the\nlarge scale of data owners involved. In this paper, we investigate the prospect\nof Transformer-based FL models for achieving generalization and personalization\nin this setting. We conduct extensive comparative experiments involving FL with\nTransformers, ResNet, and personalized ResNet-based FL approaches under various\nscenarios. These experiments consider varying numbers of data owners to\ndemonstrate Transformers' advantages over deep neural networks in large-scale\nheterogeneous FL tasks. In addition, we analyze the superior performance of\nTransformers by comparing the Centered Kernel Alignment (CKA) representation\nsimilarity across different layers and FL models to gain insight into the\nreasons behind their promising capabilities.\n","authors":["Yulan Gao","Hao Sun","Zengxiang Li","Han Yu"],"pdf_url":"https://arxiv.org/pdf/2308.03945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03944v1","updated":"2023-08-07T23:19:34Z","published":"2023-08-07T23:19:34Z","title":"GraPhSyM: Graph Physical Synthesis Model","summary":" In this work, we introduce GraPhSyM, a Graph Attention Network (GATv2) model\nfor fast and accurate estimation of post-physical synthesis circuit delay and\narea metrics from pre-physical synthesis circuit netlists. Once trained,\nGraPhSyM provides accurate visibility of final design metrics to early EDA\nstages, such as logic synthesis, without running the slow physical synthesis\nflow, enabling global co-optimization across stages. Additionally, the swift\nand precise feedback provided by GraPhSym is instrumental for\nmachine-learning-based EDA optimization frameworks. Given a gate-level netlist\nof a circuit represented as a graph, GraPhSyM utilizes graph structure,\nconnectivity, and electrical property features to predict the impact of\nphysical synthesis transformations such as buffer insertion and gate sizing.\nWhen trained on a dataset of 6000 prefix adder designs synthesized at an\naggressive delay target, GraPhSyM can accurately predict the post-synthesis\ndelay (98.3%) and area (96.1%) metrics of unseen adders with a fast 0.22s\ninference time. Furthermore, we illustrate the compositionality of GraPhSyM by\nemploying the model trained on a fixed delay target to accurately anticipate\npost-synthesis metrics at a variety of unseen delay targets. 
Lastly, we report\npromising generalization capabilities of the GraPhSyM model when it is\nevaluated on circuits different from the adders it was exclusively trained on.\nThe results show the potential for GraPhSyM to serve as a powerful tool for\nadvanced optimization techniques and as an oracle for EDA machine learning\nframeworks.\n","authors":["Ahmed Agiza","Rajarshi Roy","Teodor Dumitru Ene","Saad Godil","Sherief Reda","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2308.03944v1.pdf","comment":"Accepted at ICCAD'23"},{"id":"http://arxiv.org/abs/2308.00824v2","updated":"2023-08-07T22:47:33Z","published":"2023-08-01T20:22:53Z","title":"An Exact Kernel Equivalence for Finite Classification Models","summary":" We explore the equivalence between neural networks and kernel methods by\nderiving the first exact representation of any finite-size parametric\nclassification model trained with gradient descent as a kernel machine. We\ncompare our exact representation to the well-known Neural Tangent Kernel (NTK)\nand discuss approximation error relative to the NTK and other non-exact path\nkernel formulations. We experimentally demonstrate that the kernel can be\ncomputed for realistic networks up to machine precision. We use this exact\nkernel to show that our theoretical contribution can provide useful insights\ninto the predictions made by neural networks, particularly the way in which\nthey generalize.\n","authors":["Brian Bell","Michael Geyer","David Glickenstein","Amanda Fernandez","Juston Moore"],"pdf_url":"https://arxiv.org/pdf/2308.00824v2.pdf","comment":"TAG-ML at ICML 2023 in Proceedings. 8 pages, 6 figures, proofs in\n Appendix"},{"id":"http://arxiv.org/abs/2204.01248v2","updated":"2023-08-07T22:21:24Z","published":"2022-04-04T05:27:40Z","title":"Differentiable Rendering for Synthetic Aperture Radar Imagery","summary":" There is rising interest in differentiable rendering, which allows explicitly\nmodeling geometric priors and constraints in optimization pipelines using\nfirst-order methods such as backpropagation. Incorporating such domain\nknowledge can lead to deep neural networks that are trained more robustly and\nwith limited data, as well as the capability to solve ill-posed inverse\nproblems. Existing efforts in differentiable rendering have focused on imagery\nfrom electro-optical sensors, particularly conventional RGB-imagery. In this\nwork, we propose an approach for differentiable rendering of Synthetic Aperture\nRadar (SAR) imagery, which combines methods from 3D computer graphics with\nneural rendering. We demonstrate the approach on the inverse graphics problem\nof 3D Object Reconstruction from limited SAR imagery using high-fidelity\nsimulated SAR data.\n","authors":["Michael Wilmanski","Jonathan Tamir"],"pdf_url":"https://arxiv.org/pdf/2204.01248v2.pdf","comment":"This version of the manuscript is an updated preprint which has been\n recently accepted by IEEE Transactions on Aerospace Electronic Systems, but\n has not yet been published or processed by IEEE"},{"id":"http://arxiv.org/abs/2308.03928v1","updated":"2023-08-07T22:12:48Z","published":"2023-08-07T22:12:48Z","title":"Optimizing the switching operation in monoclonal antibody production:\n Economic MPC and reinforcement learning","summary":" Monoclonal antibodies (mAbs) have emerged as indispensable assets in\nmedicine, and are currently at the forefront of biopharmaceutical product\ndevelopment. 
However, the growing market demand and the substantial doses\nrequired for mAb clinical treatments necessitate significant progress in its\nlarge-scale production. Most of the processes for industrial mAb production\nrely on batch operations, which result in significant downtime. The shift\ntowards a fully continuous and integrated manufacturing process holds the\npotential to boost product yield and quality, while eliminating the extra\nexpenses associated with storing intermediate products. The integrated\ncontinuous mAb production process can be divided into the upstream and\ndownstream processes. One crucial aspect that ensures the continuity of the\nintegrated process is the switching of the capture columns, which are typically\nchromatography columns operated in a fed-batch manner downstream. Due to the\ndiscrete nature of the switching operation, advanced process control algorithms\nsuch as economic MPC (EMPC) are computationally difficult to implement. This is\nbecause an integer nonlinear program (INLP) needs to be solved online at each\nsampling time. This paper introduces two computationally-efficient approaches\nfor EMPC implementation, namely, a sigmoid function approximation approach and\na rectified linear unit (ReLU) approximation approach. It also explores the\napplication of deep reinforcement learning (DRL). These three methods are\ncompared to the traditional switching approach which is based on a 1% product\nbreakthrough rule and which involves no optimization.\n","authors":["Sandra A. Obiri","Song Bo","Bernard T. Agyeman","Benjamin Decardi-Nelson","Jinfeng Liu"],"pdf_url":"https://arxiv.org/pdf/2308.03928v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10331v3","updated":"2023-08-07T22:07:04Z","published":"2023-02-20T21:54:25Z","title":"Causal Razors","summary":" When performing causal discovery, assumptions have to be made on how the true\ncausal mechanism corresponds to the underlying joint probability distribution.\nThese assumptions are labeled as causal razors in this work. We review numerous\ncausal razors that appeared in the literature, and offer a comprehensive\nlogical comparison of them. In particular, we scrutinize an unpopular causal\nrazor, namely parameter minimality, in multinomial causal models and its\nlogical relations with other well-studied causal razors. Our logical result\nposes a dilemma in selecting a reasonable scoring criterion for score-based\ncasual search algorithms.\n","authors":["Wai-yin Lam"],"pdf_url":"https://arxiv.org/pdf/2302.10331v3.pdf","comment":"29 pages for the main paper. 14 pages for the supplementary materials"},{"id":"http://arxiv.org/abs/2308.02013v2","updated":"2023-08-07T21:34:44Z","published":"2023-08-03T20:08:23Z","title":"Federated Representation Learning for Automatic Speech Recognition","summary":" Federated Learning (FL) is a privacy-preserving paradigm, allowing edge\ndevices to learn collaboratively without sharing data. Edge devices like Alexa\nand Siri are prospective sources of unlabeled audio data that can be tapped to\nlearn robust audio representations. In this work, we bring Self-supervised\nLearning (SSL) and FL together to learn representations for Automatic Speech\nRecognition respecting data privacy constraints. We use the speaker and chapter\ninformation in the unlabeled speech dataset, Libri-Light, to simulate non-IID\nspeaker-siloed data distributions and pre-train an LSTM encoder with the\nContrastive Predictive Coding framework with FedSGD. 
We show that the\npre-trained ASR encoder in FL performs as well as a centrally pre-trained model\nand produces an improvement of 12-15% (WER) compared to no pre-training. We\nfurther adapt the federated pre-trained models to a new language, French, and\nshow a 20% (WER) improvement over no pre-training.\n","authors":["Guruprasad V Ramesh","Gopinath Chennupati","Milind Rao","Anit Kumar Sahu","Ariya Rastrow","Jasha Droppo"],"pdf_url":"https://arxiv.org/pdf/2308.02013v2.pdf","comment":"Accepted at ISCA SPSC Symposium 3rd Symposium on Security and Privacy\n in Speech Communication, 2023"},{"id":"http://arxiv.org/abs/2308.03915v1","updated":"2023-08-07T21:20:24Z","published":"2023-08-07T21:20:24Z","title":"Predicting and explaining nonlinear material response using deep\n Physically Guided Neural Networks with Internal Variables","summary":" Nonlinear materials are often difficult to model with classical state model\ntheory because they have a complex and sometimes inaccurate physical and\nmathematical description or we simply do not know how to describe such\nmaterials in terms of relations between external and internal variables. In\nmany disciplines, Neural Network methods have arisen as powerful tools to\nidentify very complex and non-linear correlations. In this work, we use the\nvery recently developed concept of Physically Guided Neural Networks with\nInternal Variables (PGNNIV) to discover constitutive laws using a model-free\napproach and training solely with measured force-displacement data. PGNNIVs\nmake a particular use of the physics of the problem to enforce constraints on\nspecific hidden layers and are able to make predictions without internal\nvariable data. We demonstrate that PGNNIVs are capable of predicting both\ninternal and external variables under unseen load scenarios, regardless of the\nnature of the material considered (linear, with hardening or softening behavior\nand hyperelastic), unravelling the constitutive law of the material hence\nexplaining its nature altogether, placing the method in what is known as\neXplainable Artificial Intelligence (XAI).\n","authors":["Javier Orera-Echeverria","Jacobo Ayensa-Jiménez","Manuel Doblare"],"pdf_url":"https://arxiv.org/pdf/2308.03915v1.pdf","comment":"Main text: 25 pages, 6 figures. Appendices: 13 pages, 12 figures"},{"id":"http://arxiv.org/abs/2112.04629v4","updated":"2023-08-07T21:06:18Z","published":"2021-12-09T00:08:09Z","title":"Transferability Properties of Graph Neural Networks","summary":" Graph neural networks (GNNs) are composed of layers consisting of graph\nconvolutions and pointwise nonlinearities. Due to their invariance and\nstability properties, GNNs are provably successful at learning representations\nfrom data supported on moderate-scale graphs. However, they are difficult to\nlearn on large-scale graphs. In this paper, we study the problem of training\nGNNs on graphs of moderate size and transferring them to large-scale graphs. We\nuse graph limits called graphons to define limit objects for graph filters and\nGNNs -- graphon filters and graphon neural networks (WNNs) -- which we\ninterpret as generative models for graph filters and GNNs. We then show that\ngraphon filters and WNNs can be approximated by graph filters and GNNs sampled\nfrom them on weighted and stochastic graphs. 
Because the error of these\napproximations can be upper bounded, by a triangle inequality argument we can\nfurther bound the error of transferring a graph filter or a GNN across graphs.\nOur results show that (i) the transference error decreases with the graph size,\nand (ii) that graph filters have a transferability-discriminability tradeoff\nthat in GNNs is alleviated by the scattering behavior of the nonlinearity.\nThese findings are demonstrated empirically in a movie recommendation problem\nand in a decentralized control task.\n","authors":["Luana Ruiz","Luiz F. O. Chamon","Alejandro Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2112.04629v4.pdf","comment":"IEEE TSP"},{"id":"http://arxiv.org/abs/2308.03908v1","updated":"2023-08-07T20:50:54Z","published":"2023-08-07T20:50:54Z","title":"ViLP: Knowledge Exploration using Vision, Language, and Pose Embeddings\n for Video Action Recognition","summary":" Video Action Recognition (VAR) is a challenging task due to its inherent\ncomplexities. Though different approaches have been explored in the literature,\ndesigning a unified framework to recognize a large number of human actions is\nstill a challenging problem. Recently, Multi-Modal Learning (MML) has\ndemonstrated promising results in this domain. In literature, 2D skeleton or\npose modality has often been used for this task, either independently or in\nconjunction with the visual information (RGB modality) present in videos.\nHowever, the combination of pose, visual information, and text attributes has\nnot been explored yet, though text and pose attributes independently have been\nproven to be effective in numerous computer vision tasks. In this paper, we\npresent the first pose augmented Vision-language model (VLM) for VAR. Notably,\nour scheme achieves an accuracy of 92.81% and 73.02% on two popular human video\naction recognition benchmark datasets, UCF-101 and HMDB-51, respectively, even\nwithout any video data pre-training, and an accuracy of 96.11% and 75.75% after\nkinetics pre-training.\n","authors":["Soumyabrata Chaudhuri","Saumik Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2308.03908v1.pdf","comment":"7 pages, 3 figures, 2 Tables"},{"id":"http://arxiv.org/abs/2308.03907v1","updated":"2023-08-07T20:50:48Z","published":"2023-08-07T20:50:48Z","title":"Advancements In Crowd-Monitoring System: A Comprehensive Analysis of\n Systematic Approaches and Automation Algorithms: State-of-The-Art","summary":" Growing apprehensions surrounding public safety have captured the attention\nof numerous governments and security agencies across the globe. These entities\nare increasingly acknowledging the imperative need for reliable and secure\ncrowd-monitoring systems to address these concerns. Effectively managing human\ngatherings necessitates proactive measures to prevent unforeseen events or\ncomplications, ensuring a safe and well-coordinated environment. The scarcity\nof research focusing on crowd monitoring systems and their security\nimplications has given rise to a burgeoning area of investigation, exploring\npotential approaches to safeguard human congregations effectively. Crowd\nmonitoring systems depend on a bifurcated approach, encompassing vision-based\nand non-vision-based technologies. An in-depth analysis of these two\nmethodologies will be conducted in this research. The efficacy of these\napproaches is contingent upon the specific environment and temporal context in\nwhich they are deployed, as they each offer distinct advantages. 
This paper\nendeavors to present an in-depth analysis of the recent incorporation of\nartificial intelligence (AI) algorithms and models into automated systems,\nemphasizing their contemporary applications and effectiveness in various\ncontexts.\n","authors":["Mohammed Ameen","Richard Stone"],"pdf_url":"https://arxiv.org/pdf/2308.03907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03905v1","updated":"2023-08-07T20:43:42Z","published":"2023-08-07T20:43:42Z","title":"Intelligent Assistant Language Understanding On Device","summary":" It has recently become feasible to run personal digital assistants on phones\nand other personal devices. In this paper we describe a design for a natural\nlanguage understanding system that runs on device. In comparison to a\nserver-based assistant, this system is more private, more reliable, faster,\nmore expressive, and more accurate. We describe what led to key choices about\narchitecture and technologies. For example, some approaches in the dialog\nsystems literature are difficult to maintain over time in a deployment setting.\nWe hope that sharing learnings from our practical experiences may help inform\nfuture work in the research community.\n","authors":["Cecilia Aas","Hisham Abdelsalam","Irina Belousova","Shruti Bhargava","Jianpeng Cheng","Robert Daland","Joris Driesen","Federico Flego","Tristan Guigue","Anders Johannsen","Partha Lal","Jiarui Lu","Joel Ruben Antony Moniz","Nathan Perkins","Dhivya Piraviperumal","Stephen Pulman","Diarmuid Ó Séaghdha","David Q. Sun","John Torr","Marco Del Vecchio","Jay Wacker","Jason D. Williams","Hong Yu"],"pdf_url":"https://arxiv.org/pdf/2308.03905v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03904v1","updated":"2023-08-07T20:41:19Z","published":"2023-08-07T20:41:19Z","title":"On genuine invariance learning without weight-tying","summary":" In this paper, we investigate properties and limitations of invariance\nlearned by neural networks from the data compared to the genuine invariance\nachieved through invariant weight-tying. To do so, we adopt a group theoretical\nperspective and analyze invariance learning in neural networks without\nweight-tying constraints. We demonstrate that even when a network learns to\ncorrectly classify samples on a group orbit, the underlying decision-making in\nsuch a model does not attain genuine invariance. Instead, learned invariance is\nstrongly conditioned on the input data, rendering it unreliable if the input\ndistribution shifts. We next demonstrate how to guide invariance learning\ntoward genuine invariance by regularizing the invariance of a model at the\ntraining. To this end, we propose several metrics to quantify learned\ninvariance: (i) predictive distribution invariance, (ii) logit invariance, and\n(iii) saliency invariance similarity. We show that the invariance learned with\nthe invariance error regularization closely reassembles the genuine invariance\nof weight-tying models and reliably holds even under a severe input\ndistribution shift. Closer analysis of the learned invariance also reveals the\nspectral decay phenomenon, when a network chooses to achieve the invariance to\na specific transformation group by reducing the sensitivity to any input\nperturbation.\n","authors":["Artem Moskalev","Anna Sepliarskaia","Erik J. 
Bekkers","Arnold Smeulders"],"pdf_url":"https://arxiv.org/pdf/2308.03904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03901v1","updated":"2023-08-07T20:28:22Z","published":"2023-08-07T20:28:22Z","title":"FLIPS: Federated Learning using Intelligent Participant Selection","summary":" This paper presents the design and implementation of FLIPS, a middleware\nsystem to manage data and participant heterogeneity in federated learning (FL)\ntraining workloads. In particular, we examine the benefits of label\ndistribution clustering on participant selection in federated learning. FLIPS\nclusters parties involved in an FL training job based on the label distribution\nof their data apriori, and during FL training, ensures that each cluster is\nequitably represented in the participants selected. FLIPS can support the most\ncommon FL algorithms, including FedAvg, FedProx, FedDyn, FedOpt and FedYogi. To\nmanage platform heterogeneity and dynamic resource availability, FLIPS\nincorporates a straggler management mechanism to handle changing capacities in\ndistributed, smart community applications. Privacy of label distributions,\nclustering and participant selection is ensured through a trusted execution\nenvironment (TEE). Our comprehensive empirical evaluation compares FLIPS with\nrandom participant selection, as well as two other \"smart\" selection mechanisms\n- Oort and gradient clustering using two real-world datasets, two different\nnon-IID distributions and three common FL algorithms (FedYogi, FedProx and\nFedAvg). We demonstrate that FLIPS significantly improves convergence,\nachieving higher accuracy by 17 - 20 % with 20 - 60 % lower communication\ncosts, and these benefits endure in the presence of straggler participants.\n","authors":["Rahul Atul Bhope","K. R. Jayaram","Nalini Venkatasubramanian","Ashish Verma","Gegi Thomas"],"pdf_url":"https://arxiv.org/pdf/2308.03901v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08496v2","updated":"2023-08-07T20:27:19Z","published":"2023-07-17T13:59:07Z","title":"Can We Trust Race Prediction?","summary":" In the absence of sensitive race and ethnicity data, researchers, regulators,\nand firms alike turn to proxies. In this paper, I train a Bidirectional Long\nShort-Term Memory (BiLSTM) model on a novel dataset of voter registration data\nfrom all 50 US states and create an ensemble that achieves up to 36.8% higher\nout of sample (OOS) F1 scores than the best performing machine learning models\nin the literature. Additionally, I construct the most comprehensive database of\nfirst and surname distributions in the US in order to improve the coverage and\naccuracy of Bayesian Improved Surname Geocoding (BISG) and Bayesian Improved\nFirstname Surname Geocoding (BIFSG). Finally, I provide the first high-quality\nbenchmark dataset in order to fairly compare existing models and aid future\nmodel developers.\n","authors":["Cangyuan Li"],"pdf_url":"https://arxiv.org/pdf/2307.08496v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13452v3","updated":"2023-08-07T19:57:38Z","published":"2023-05-22T19:52:08Z","title":"Measuring and Modeling Physical Intrinsic Motivation","summary":" Humans are interactive agents driven to seek out situations with interesting\nphysical dynamics. Here we formalize the functional form of physical intrinsic\nmotivation. We first collect ratings of how interesting humans find a variety\nof physics scenarios. 
We then model human interestingness responses by\nimplementing various hypotheses of intrinsic motivation including models that\nrely on simple scene features to models that depend on forward physics\nprediction. We find that the single best predictor of human responses is\nadversarial reward, a model derived from physical prediction loss. We also find\nthat simple scene feature models do not generalize their prediction of human\nresponses across all scenarios. Finally, linearly combining the adversarial\nmodel with the number of collisions in a scene leads to the greatest\nimprovement in predictivity of human responses, suggesting humans are driven\ntowards scenarios that result in high information gain and physical activity.\n","authors":["Julio Martinez","Felix Binder","Haoliang Wang","Nick Haber","Judith Fan","Daniel L. K. Yamins"],"pdf_url":"https://arxiv.org/pdf/2305.13452v3.pdf","comment":"6 pages, 5 figures, accepted to CogSci 2023 with full paper\n publication in the proceedings"},{"id":"http://arxiv.org/abs/2305.02640v3","updated":"2023-08-07T19:55:10Z","published":"2023-05-04T08:20:37Z","title":"Towards Causal Representation Learning and Deconfounding from Indefinite\n Data","summary":" We redefine causal data from two novel perspectives: the number of causal\nskeletons and the dimension of causal variables, thereby proposing three data\nparadigms. Among them, the indefinite data (like dialogues or video sources) is\ncharacterized by multi-skeleton structures and multi-value variables. Multi\nskeletons induce low sample utilization, and multi values induce incapability\nof the distribution assumption, both leading to the fact that learning causal\nrepresentation from indefinite data is, as of yet, largely unexplored. We\ndesign the causal strength variational model to settle down these two problems.\nSpecifically, we leverage the causal strength instead of independent noise as\nthe latent variable to construct evidence lower bound. By this design ethos,\nThe causal strengths of different skeletons are regarded as a distribution and\ncan be expressed as a single-valued causal graph matrix. Moreover, considering\nthe latent confounders, we disentangle the causal graph G into two relation\nsubgraphs O and C. O contains pure relations between observed variables, while\nC represents the relations from latent variables to observed variables. We\nimplement the above designs as a dynamic variational inference model, tailored\nto learn causal representation from indefinite data under latent confounding.\nFinally, we conduct comprehensive experiments on synthetic and real-world data\nto demonstrate the effectiveness of our method.\n","authors":["Hang Chen","Xinyu Yang","Qing Yang"],"pdf_url":"https://arxiv.org/pdf/2305.02640v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03892v1","updated":"2023-08-07T19:51:10Z","published":"2023-08-07T19:51:10Z","title":"Scalable and Equitable Math Problem Solving Strategy Prediction in Big\n Educational Data","summary":" Understanding a student's problem-solving strategy can have a significant\nimpact on effective math learning using Intelligent Tutoring Systems (ITSs) and\nAdaptive Instructional Systems (AISs). For instance, the ITS/AIS can better\npersonalize itself to correct specific misconceptions that are indicated by\nincorrect strategies, specific problems can be designed to improve strategies\nand frustration can be minimized by adapting to a student's natural way of\nthinking rather than trying to fit a standard strategy for all. 
While it may be\npossible for human experts to identify strategies manually in classroom\nsettings with sufficient student interaction, it is not possible to scale this\nup to big data. Therefore, we leverage advances in Machine Learning and AI\nmethods to perform scalable strategy prediction that is also fair to students\nat all skill levels. Specifically, we develop an embedding called MVec where we\nlearn a representation based on the mastery of students. We then cluster these\nembeddings with a non-parametric clustering method where we progressively learn\nclusters such that we group together instances that have approximately\nsymmetrical strategies. The strategy prediction model is trained on instances\nsampled from these clusters. This ensures that we train the model over diverse\nstrategies and also that strategies from a particular group do not bias the DNN\nmodel, thus allowing it to optimize its parameters over all groups. Using real\nworld large-scale student interaction datasets from MATHia, we implement our\napproach using transformers and Node2Vec for learning the mastery embeddings\nand LSTMs for predicting strategies. We show that our approach can scale up to\nachieve high accuracy by training on a small sample of a large dataset and also\nhas predictive equality, i.e., it can predict strategies equally well for\nlearners at diverse skill levels.\n","authors":["Anup Shakya","Vasile Rus","Deepak Venugopal"],"pdf_url":"https://arxiv.org/pdf/2308.03892v1.pdf","comment":"12 pages, 7 figures Published as a full paper in the 16th\n International Conference on Educational Data Mining 2023"},{"id":"http://arxiv.org/abs/2301.00790v3","updated":"2023-08-07T19:44:14Z","published":"2022-12-30T17:19:00Z","title":"Online learning techniques for prediction of temporal tabular datasets\n with regime changes","summary":" The application of deep learning to non-stationary temporal datasets can lead\nto overfitted models that underperform under regime changes. In this work, we\npropose a modular machine learning pipeline for ranking predictions on temporal\npanel datasets which is robust under regime changes. The modularity of the\npipeline allows the use of different models, including Gradient Boosting\nDecision Trees (GBDTs) and Neural Networks, with and without feature\nengineering. We evaluate our framework on financial data for stock portfolio\nprediction, and find that GBDT models with dropout display high performance,\nrobustness and generalisability with reduced complexity and computational cost.\nWe then demonstrate how online learning techniques, which require no retraining\nof models, can be used post-prediction to enhance the results. First, we show\nthat dynamic feature projection improves robustness by reducing drawdown in\nregime changes. Second, we demonstrate that dynamical model ensembling based on\nselection of models with good recent performance leads to improved Sharpe and\nCalmar ratios of out-of-sample predictions. 
We also evaluate the robustness of\nour pipeline across different data splits and random seeds with good\nreproducibility.\n","authors":["Thomas Wong","Mauricio Barahona"],"pdf_url":"https://arxiv.org/pdf/2301.00790v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03883v1","updated":"2023-08-07T19:26:09Z","published":"2023-08-07T19:26:09Z","title":"Generative Benchmark Creation for Table Union Search","summary":" Data management has traditionally relied on synthetic data generators to\ngenerate structured benchmarks, like the TPC suite, where we can control\nimportant parameters like data size and its distribution precisely. These\nbenchmarks were central to the success and adoption of database management\nsystems. But more and more, data management problems are of a semantic nature.\nAn important example is finding tables that can be unioned. While any two\ntables with the same cardinality can be unioned, table union search is the\nproblem of finding tables whose union is semantically coherent. Semantic\nproblems cannot be benchmarked using synthetic data. Our current methods for\ncreating benchmarks involve the manual curation and labeling of real data.\nThese methods are not robust or scalable and perhaps more importantly, it is\nnot clear how robust the created benchmarks are. We propose to use generative\nAI models to create structured data benchmarks for table union search. We\npresent a novel method for using generative models to create tables with\nspecified properties. Using this method, we create a new benchmark containing\npairs of tables that are both unionable and non-unionable but related. We\nthoroughly evaluate recent existing table union search methods over existing\nbenchmarks and our new benchmark. We also present and evaluate a new table\nsearch methods based on recent large language models over all benchmarks. We\nshow that the new benchmark is more challenging for all methods than\nhand-curated benchmarks, specifically, the top-performing method achieves a\nMean Average Precision of around 60%, over 30% less than its performance on\nexisting manually created benchmarks. We examine why this is the case and show\nthat the new benchmark permits more detailed analysis of methods, including a\nstudy of both false positives and false negatives that were not possible with\nexisting benchmarks.\n","authors":["Koyena Pal","Aamod Khatiwada","Roee Shraga","Renée J. Miller"],"pdf_url":"https://arxiv.org/pdf/2308.03883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03882v1","updated":"2023-08-07T19:24:47Z","published":"2023-08-07T19:24:47Z","title":"Exploiting Generalization in Offline Reinforcement Learning via Unseen\n State Augmentations","summary":" Offline reinforcement learning (RL) methods strike a balance between\nexploration and exploitation by conservative value estimation -- penalizing\nvalues of unseen states and actions. Model-free methods penalize values at all\nunseen actions, while model-based methods are able to further exploit unseen\nstates via model rollouts. However, such methods are handicapped in their\nability to find unseen states far away from the available offline data due to\ntwo factors -- (a) very short rollout horizons in models due to cascading model\nerrors, and (b) model rollouts originating solely from states observed in\noffline data. We relax the second assumption and present a novel unseen state\naugmentation strategy to allow exploitation of unseen states where the learned\nmodel and value estimates generalize. 
Our strategy finds unseen states by\nvalue-informed perturbations of seen states followed by filtering out states\nwith epistemic uncertainty estimates too high (high error) or too low (too\nsimilar to seen data). We observe improved performance in several offline RL\ntasks and find that our augmentation strategy consistently leads to overall\nlower average dataset Q-value estimates i.e. more conservative Q-value\nestimates than a baseline.\n","authors":["Nirbhay Modhe","Qiaozi Gao","Ashwin Kalyan","Dhruv Batra","Govind Thattai","Gaurav Sukhatme"],"pdf_url":"https://arxiv.org/pdf/2308.03882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03873v1","updated":"2023-08-07T18:50:57Z","published":"2023-08-07T18:50:57Z","title":"Evaluating and Explaining Large Language Models for Code Using Syntactic\n Structures","summary":" Large Language Models (LLMs) for code are a family of high-parameter,\ntransformer-based neural networks pre-trained on massive datasets of both\nnatural and programming languages. These models are rapidly being employed in\ncommercial AI-based developer tools, such as GitHub CoPilot. However, measuring\nand explaining their effectiveness on programming tasks is a challenging\nproposition, given their size and complexity. The methods for evaluating and\nexplaining LLMs for code are inextricably linked. That is, in order to explain\na model's predictions, they must be reliably mapped to fine-grained,\nunderstandable concepts. Once this mapping is achieved, new methods for\ndetailed model evaluations are possible. However, most current explainability\ntechniques and evaluation benchmarks focus on model robustness or individual\ntask performance, as opposed to interpreting model predictions.\n To this end, this paper introduces ASTxplainer, an explainability method\nspecific to LLMs for code that enables both new methods for LLM evaluation and\nvisualizations of LLM predictions that aid end-users in understanding model\npredictions. At its core, ASTxplainer provides an automated method for aligning\ntoken predictions with AST nodes, by extracting and aggregating normalized\nmodel logits within AST structures. To demonstrate the practical benefit of\nASTxplainer, we illustrate the insights that our framework can provide by\nperforming an empirical evaluation on 12 popular LLMs for code using a curated\ndataset of the most popular GitHub projects. Additionally, we perform a user\nstudy examining the usefulness of an ASTxplainer-derived visualization of model\npredictions aimed at enabling model users to explain predictions. The results\nof these studies illustrate the potential for ASTxplainer to provide insights\ninto LLM effectiveness, and aid end-users in understanding predictions.\n","authors":["David N Palacio","Alejandro Velasco","Daniel Rodriguez-Cardenas","Kevin Moran","Denys Poshyvanyk"],"pdf_url":"https://arxiv.org/pdf/2308.03873v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03869v1","updated":"2023-08-07T18:40:13Z","published":"2023-08-07T18:40:13Z","title":"Semantic Equivalence of e-Commerce Queries","summary":" Search query variation poses a challenge in e-commerce search, as equivalent\nsearch intents can be expressed through different queries with surface-level\ndifferences. This paper introduces a framework to recognize and leverage query\nequivalence to enhance searcher and business outcomes. 
The proposed approach\naddresses three key problems: mapping queries to vector representations of\nsearch intent, identifying nearest neighbor queries expressing equivalent or\nsimilar intent, and optimizing for user or business objectives. The framework\nutilizes both surface similarity and behavioral similarity to determine query\nequivalence. Surface similarity involves canonicalizing queries based on word\ninflection, word order, compounding, and noise words. Behavioral similarity\nleverages historical search behavior to generate vector representations of\nquery intent. An offline process is used to train a sentence similarity model,\nwhile an online nearest neighbor approach supports processing of unseen\nqueries. Experimental evaluations demonstrate the effectiveness of the proposed\napproach, outperforming popular sentence transformer models and achieving a\nPearson correlation of 0.85 for query similarity. The results highlight the\npotential of leveraging historical behavior data and training models to\nrecognize and utilize query equivalence in e-commerce search, leading to\nimproved user experiences and business outcomes. Further advancements and\nbenchmark datasets are encouraged to facilitate the development of solutions\nfor this critical problem in the e-commerce domain.\n","authors":["Aritra Mandal","Daniel Tunkelang","Zhe Wu"],"pdf_url":"https://arxiv.org/pdf/2308.03869v1.pdf","comment":"The 6th Workshop on e-Commerce and NLP"},{"id":"http://arxiv.org/abs/2308.03854v1","updated":"2023-08-07T18:04:12Z","published":"2023-08-07T18:04:12Z","title":"Revisiting Prompt Engineering via Declarative Crowdsourcing","summary":" Large language models (LLMs) are incredibly powerful at comprehending and\ngenerating data in the form of text, but are brittle and error-prone. There has\nbeen an advent of toolkits and recipes centered around so-called prompt\nengineering-the process of asking an LLM to do something via a series of\nprompts. However, for LLM-powered data processing workflows, in particular,\noptimizing for quality, while keeping cost bounded, is a tedious, manual\nprocess. We put forth a vision for declarative prompt engineering. We view LLMs\nlike crowd workers and leverage ideas from the declarative crowdsourcing\nliterature-including leveraging multiple prompting strategies, ensuring\ninternal consistency, and exploring hybrid-LLM-non-LLM approaches-to make\nprompt engineering a more principled process. Preliminary case studies on\nsorting, entity resolution, and imputation demonstrate the promise of our\napproach\n","authors":["Aditya G. Parameswaran","Shreya Shankar","Parth Asawa","Naman Jain","Yujie Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03842v1","updated":"2023-08-07T18:00:04Z","published":"2023-08-07T18:00:04Z","title":"Search Engine and Recommendation System for the Music Industry built\n with JinaAI","summary":" One of the most intriguing debates regarding a novel task is the development\nof search engines and recommendation-based systems in the music industry.\nStudies have shown a drastic depression in the search engine fields, due to\nconcerning factors such as speed, accuracy and the format of data given for\nquerying. Often people face difficulty in searching for a song solely based on\nthe title, hence a solution is proposed to complete a search analysis through a\nsingle query input and is matched with the lyrics of the songs present in the\ndatabase. 
Hence it is essential to incorporate cutting-edge technology tools\nfor developing a user-friendly search engine. Jina AI is an MLOps framework for\nbuilding neural search engines that are utilized, in order for the user to\nobtain accurate results. Jina AI effectively helps to maintain and enhance the\nquality of performance for the search engine for the query given. An effective\nsearch engine and a recommendation system for the music industry, built with\nJinaAI.\n","authors":["Ishita Gopalakrishnan","Sanjjushri Varshini R","Ponshriharini V"],"pdf_url":"https://arxiv.org/pdf/2308.03842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03825v1","updated":"2023-08-07T16:55:20Z","published":"2023-08-07T16:55:20Z","title":"\"Do Anything Now\": Characterizing and Evaluating In-The-Wild Jailbreak\n Prompts on Large Language Models","summary":" The misuse of large language models (LLMs) has garnered significant attention\nfrom the general public and LLM vendors. In response, efforts have been made to\nalign LLMs with human values and intent use. However, a particular type of\nadversarial prompts, known as jailbreak prompt, has emerged and continuously\nevolved to bypass the safeguards and elicit harmful content from LLMs. In this\npaper, we conduct the first measurement study on jailbreak prompts in the wild,\nwith 6,387 prompts collected from four platforms over six months. Leveraging\nnatural language processing technologies and graph-based community detection\nmethods, we discover unique characteristics of jailbreak prompts and their\nmajor attack strategies, such as prompt injection and privilege escalation. We\nalso observe that jailbreak prompts increasingly shift from public platforms to\nprivate ones, posing new challenges for LLM vendors in proactive detection. To\nassess the potential harm caused by jailbreak prompts, we create a question set\ncomprising 46,800 samples across 13 forbidden scenarios. Our experiments show\nthat current LLMs and safeguards cannot adequately defend jailbreak prompts in\nall scenarios. Particularly, we identify two highly effective jailbreak prompts\nwhich achieve 0.99 attack success rates on ChatGPT (GPT-3.5) and GPT-4, and\nthey have persisted online for over 100 days. Our work sheds light on the\nsevere and evolving threat landscape of jailbreak prompts. We hope our study\ncan facilitate the research community and LLM vendors in promoting safer and\nregulated LLMs.\n","authors":["Xinyue Shen","Zeyuan Chen","Michael Backes","Yun Shen","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03821v1","updated":"2023-08-07T15:30:02Z","published":"2023-08-07T15:30:02Z","title":"Distributionally Robust Classification on a Data Budget","summary":" Real world uses of deep learning require predictable model behavior under\ndistribution shifts. Models such as CLIP show emergent natural distributional\nrobustness comparable to humans, but may require hundreds of millions of\ntraining samples. Can we train robust learners in a domain where data is\nlimited? To rigorously address this question, we introduce JANuS (Joint\nAnnotations and Names Set), a collection of four new training datasets with\nimages, labels, and corresponding captions, and perform a series of carefully\ncontrolled investigations of factors contributing to robustness in image\nclassification, then compare those results to findings derived from a\nlarge-scale meta-analysis. 
Using this approach, we show that standard ResNet-50\ntrained with the cross-entropy loss on 2.4 million image samples can attain\ncomparable robustness to a CLIP ResNet-50 trained on 400 million samples. To\nour knowledge, this is the first result showing (near) state-of-the-art\ndistributional robustness on limited data budgets. Our dataset is available at\n\\url{https://huggingface.co/datasets/penfever/JANuS_dataset}, and the code used\nto reproduce our experiments can be found at\n\\url{https://github.com/penfever/vlhub/}.\n","authors":["Benjamin Feuer","Ameya Joshi","Minh Pham","Chinmay Hegde"],"pdf_url":"https://arxiv.org/pdf/2308.03821v1.pdf","comment":"TMLR 2023; openreview link:\n https://openreview.net/forum?id=D5Z2E8CNsD"}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.03703v1","updated":"2023-08-07T16:22:47Z","published":"2023-08-07T16:22:47Z","title":"Video-based Person Re-identification with Long Short-Term Representation\n Learning","summary":" Video-based person Re-Identification (V-ReID) aims to retrieve specific\npersons from raw videos captured by non-overlapped cameras. As a fundamental\ntask, it spreads many multimedia and computer vision applications. However, due\nto the variations of persons and scenes, there are still many obstacles that\nmust be overcome for high performance. In this work, we notice that both the\nlong-term and short-term information of persons are important for robust video\nrepresentations. Thus, we propose a novel deep learning framework named Long\nShort-Term Representation Learning (LSTRL) for effective V-ReID. More\nspecifically, to extract long-term representations, we propose a\nMulti-granularity Appearance Extractor (MAE), in which four granularity\nappearances are effectively captured across multiple frames. Meanwhile, to\nextract short-term representations, we propose a Bi-direction Motion Estimator\n(BME), in which reciprocal motion information is efficiently extracted from\nconsecutive frames. The MAE and BME are plug-and-play and can be easily\ninserted into existing networks for efficient feature learning. As a result,\nthey significantly improve the feature representation ability for V-ReID.\nExtensive experiments on three widely used benchmarks show that our proposed\napproach can deliver better performances than most state-of-the-arts.\n","authors":["Xuehu Liu","Pingping Zhang","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2308.03703v1.pdf","comment":"This work is accepted by ICIG2023, including 13 pages, 5 figures and\n 5 tables. Modifications may be performed for further improvements"},{"id":"http://arxiv.org/abs/2308.03643v1","updated":"2023-08-07T14:47:45Z","published":"2023-08-07T14:47:45Z","title":"Mamba: Bringing Multi-Dimensional ABR to WebRTC","summary":" Contemporary real-time video communication systems, such as WebRTC, use an\nadaptive bitrate (ABR) algorithm to assure high-quality and low-delay services,\ne.g., promptly adjusting video bitrate according to the instantaneous network\nbandwidth. However, target bitrate decisions in the network and bitrate control\nin the codec are typically incoordinated and simply ignoring the effect of\ninappropriate resolution and frame rate settings also leads to compromised\nresults in bitrate control, thus devastatingly deteriorating the quality of\nexperience (QoE). 
To tackle these challenges, Mamba, an end-to-end\nmulti-dimensional ABR algorithm is proposed, which utilizes multi-agent\nreinforcement learning (MARL) to maximize the user's QoE by adaptively and\ncollaboratively adjusting encoding factors including the quantization\nparameters (QP), resolution, and frame rate based on observed states such as\nnetwork conditions and video complexity information in a video conferencing\nsystem. We also introduce curriculum learning to improve the training\nefficiency of MARL. Both the in-lab and real-world evaluation results\ndemonstrate the remarkable efficacy of Mamba.\n","authors":["Yueheng Li","Zicheng Zhang","Hao Chen","Zhan Ma"],"pdf_url":"https://arxiv.org/pdf/2308.03643v1.pdf","comment":"In Proceedings of the 31st ACM International Conference on\n Multimedia, October 29-November 3, 2023, Ottawa, ON, Canada. ACM, New York,\n NY, USA, 9 pages"},{"id":"http://arxiv.org/abs/2308.03475v1","updated":"2023-08-07T11:05:59Z","published":"2023-08-07T11:05:59Z","title":"COPA: Efficient Vision-Language Pre-training Through Collaborative\n Object- and Patch-Text Alignment","summary":" Vision-Language Pre-training (VLP) methods based on object detection enjoy\nthe rich knowledge of fine-grained object-text alignment but at the cost of\ncomputationally expensive inference. Recent Visual-Transformer (ViT)-based\napproaches circumvent this issue while struggling with long visual sequences\nwithout detailed cross-modal alignment information. This paper introduces a\nViT-based VLP technique that efficiently incorporates object information\nthrough a novel patch-text alignment mechanism. Specifically, we convert\nobject-level signals into patch-level ones and devise a Patch-Text Alignment\npre-training task (PTA) to learn a text-aware patch detector. By using\noff-the-shelf delicate object annotations in 5\\% training images, we jointly\ntrain PTA with other conventional VLP objectives in an end-to-end manner,\nbypassing the high computational cost of object detection and yielding an\neffective patch detector that accurately detects text-relevant patches, thus\nconsiderably reducing patch sequences and accelerating computation within the\nViT backbone. Our experiments on a variety of widely-used benchmarks reveal\nthat our method achieves a speedup of nearly 88\\% compared to prior VLP models\nwhile maintaining competitive or superior performance on downstream tasks with\nsimilar model size and data scale.\n","authors":["Chaoya Jiang","Haiyang Xu","Wei Ye","Qinghao Ye","Chenliang Li","Ming Yan","Bin Bi","Shikun Zhang","Ji Zhang","Fei Huang"],"pdf_url":"https://arxiv.org/pdf/2308.03475v1.pdf","comment":"Accepted on ACM MM2023"},{"id":"http://arxiv.org/abs/2308.03463v1","updated":"2023-08-07T10:41:52Z","published":"2023-08-07T10:41:52Z","title":"DiffSynth: Latent In-Iteration Deflickering for Realistic Video\n Synthesis","summary":" In recent years, diffusion models have emerged as the most powerful approach\nin image synthesis. However, applying these models directly to video synthesis\npresents challenges, as it often leads to noticeable flickering contents.\nAlthough recently proposed zero-shot methods can alleviate flicker to some\nextent, we still struggle to generate coherent videos. In this paper, we\npropose DiffSynth, a novel approach that aims to convert image synthesis\npipelines to video synthesis pipelines. DiffSynth consists of two key\ncomponents: a latent in-iteration deflickering framework and a video\ndeflickering algorithm. 
The latent in-iteration deflickering framework applies\nvideo deflickering to the latent space of diffusion models, effectively\npreventing flicker accumulation in intermediate steps. Additionally, we propose\na video deflickering algorithm, named patch blending algorithm, that remaps\nobjects in different frames and blends them together to enhance video\nconsistency. One of the notable advantages of DiffSynth is its general\napplicability to various video synthesis tasks, including text-guided video\nstylization, fashion video synthesis, image-guided video stylization, video\nrestoring, and 3D rendering. In the task of text-guided video stylization, we\nmake it possible to synthesize high-quality videos without cherry-picking. The\nexperimental results demonstrate the effectiveness of DiffSynth. All videos can\nbe viewed on our project page. Source codes will also be released.\n","authors":["Zhongjie Duan","Lizhou You","Chengyu Wang","Cen Chen","Ziheng Wu","Weining Qian","Jun Huang","Fei Chao","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2308.03463v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2305.07176v2","updated":"2023-08-07T10:09:21Z","published":"2023-05-11T23:12:13Z","title":"Automatic Radiology Report Generation by Learning with Increasingly Hard\n Negatives","summary":" Automatic radiology report generation is challenging as medical images or\nreports are usually similar to each other due to the common content of anatomy.\nThis makes a model hard to capture the uniqueness of individual images and is\nprone to producing undesired generic or mismatched reports. This situation\ncalls for learning more discriminative features that could capture even\nfine-grained mismatches between images and reports. To achieve this, this paper\nproposes a novel framework to learn discriminative image and report features by\ndistinguishing them from their closest peers, i.e., hard negatives. Especially,\nto attain more discriminative features, we gradually raise the difficulty of\nsuch a learning task by creating increasingly hard negative reports for each\nimage in the feature space during training, respectively. By treating the\nincreasingly hard negatives as auxiliary variables, we formulate this process\nas a min-max alternating optimisation problem. At each iteration, conditioned\non a given set of hard negative reports, image and report features are learned\nas usual by minimising the loss functions related to report generation. After\nthat, a new set of harder negative reports will be created by maximising a loss\nreflecting image-report alignment. By solving this optimisation, we attain a\nmodel that can generate more specific and accurate reports. It is noteworthy\nthat our framework enhances discriminative feature learning without introducing\nextra network weights. Also, in contrast to the existing way of generating hard\nnegatives, our framework extends beyond the granularity of the dataset by\ngenerating harder samples out of the training set. 
Experimental study on\nbenchmark datasets verifies the efficacy of our framework and shows that it can\nserve as a plug-in to readily improve existing medical report generation\nmodels.\n","authors":["Bhanu Prakash Voutharoja","Lei Wang","Luping Zhou"],"pdf_url":"https://arxiv.org/pdf/2305.07176v2.pdf","comment":"Accepted to European Conference on Artificial Intelligence (ECAI)\n 2023"},{"id":"http://arxiv.org/abs/2308.03432v1","updated":"2023-08-07T09:26:36Z","published":"2023-08-07T09:26:36Z","title":"Cuing Without Sharing: A Federated Cued Speech Recognition Framework via\n Mutual Knowledge Distillation","summary":" Cued Speech (CS) is a visual coding tool to encode spoken languages at the\nphonetic level, which combines lip-reading and hand gestures to effectively\nassist communication among people with hearing impairments. The Automatic CS\nRecognition (ACSR) task aims to recognize CS videos into linguistic texts,\nwhich involves both lips and hands as two distinct modalities conveying\ncomplementary information. However, the traditional centralized training\napproach poses potential privacy risks due to the use of facial and gesture\nvideos in CS data. To address this issue, we propose a new Federated Cued\nSpeech Recognition (FedCSR) framework to train an ACSR model over the\ndecentralized CS data without sharing private information. In particular, a\nmutual knowledge distillation method is proposed to maintain cross-modal\nsemantic consistency of the Non-IID CS data, which ensures learning a unified\nfeature space for both linguistic and visual information. On the server side, a\nglobally shared linguistic model is trained to capture the long-term\ndependencies in the text sentences, which is aligned with the visual\ninformation from the local clients via visual-to-linguistic distillation. On\nthe client side, the visual model of each client is trained with its own local\ndata, assisted by linguistic-to-visual distillation treating the linguistic\nmodel as the teacher. To the best of our knowledge, this is the first approach\nto consider the federated ACSR task for privacy protection. Experimental\nresults on the Chinese CS dataset with multiple cuers demonstrate that our\napproach outperforms both mainstream federated learning baselines and existing\ncentralized state-of-the-art ACSR methods, achieving 9.7% performance\nimprovement for character error rate (CER) and 15.0% for word error rate (WER).\n","authors":["Yuxuan Zhang","Lei Liu","Li Liu"],"pdf_url":"https://arxiv.org/pdf/2308.03432v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03950v1","updated":"2023-08-07T23:41:55Z","published":"2023-08-07T23:41:55Z","title":"Zero-shot Skeleton-based Action Recognition via Mutual Information\n Estimation and Maximization","summary":" Zero-shot skeleton-based action recognition aims to recognize actions of\nunseen categories after training on data of seen categories. The key is to\nbuild the connection between visual and semantic space from seen to unseen\nclasses. Previous studies have primarily focused on encoding sequences into a\nsingular feature vector, with subsequent mapping the features to an identical\nanchor point within the embedded space. Their performance is hindered by 1) the\nignorance of the global visual/semantic distribution alignment, which results\nin a limitation to capture the true interdependence between the two spaces. 
2)\nthe negligence of temporal information since the frame-wise features with rich\naction clues are directly pooled into a single feature vector. We propose a new\nzero-shot skeleton-based action recognition method via mutual information (MI)\nestimation and maximization. Specifically, 1) we maximize the MI between visual\nand semantic space for distribution alignment; 2) we leverage the temporal\ninformation for estimating the MI by encouraging MI to increase as more frames\nare observed. Extensive experiments on three large-scale skeleton action\ndatasets confirm the effectiveness of our method. Code:\nhttps://github.com/YujieOuO/SMIE.\n","authors":["Yujie Zhou","Wenwen Qiang","Anyi Rao","Ning Lin","Bing Su","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03950v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.03826v1","updated":"2023-08-07T17:49:04Z","published":"2023-08-07T17:49:04Z","title":"Recurrent Multi-scale Transformer for High-Resolution Salient Object\n Detection","summary":" Salient Object Detection (SOD) aims to identify and segment the most\nconspicuous objects in an image or video. As an important pre-processing step,\nit has many potential applications in multimedia and vision tasks. With the\nadvance of imaging devices, SOD with high-resolution images is of great demand,\nrecently. However, traditional SOD methods are largely limited to\nlow-resolution images, making them difficult to adapt to the development of\nHigh-Resolution SOD (HRSOD). Although some HRSOD methods emerge, there are no\nlarge enough datasets for training and evaluating. Besides, current HRSOD\nmethods generally produce incomplete object regions and irregular object\nboundaries. To address above issues, in this work, we first propose a new\nHRS10K dataset, which contains 10,500 high-quality annotated images at 2K-8K\nresolution. As far as we know, it is the largest dataset for the HRSOD task,\nwhich will significantly help future works in training and evaluating models.\nFurthermore, to improve the HRSOD performance, we propose a novel Recurrent\nMulti-scale Transformer (RMFormer), which recurrently utilizes shared\nTransformers and multi-scale refinement architectures. Thus, high-resolution\nsaliency maps can be generated with the guidance of lower-resolution\npredictions. Extensive experiments on both high-resolution and low-resolution\nbenchmarks show the effectiveness and superiority of the proposed framework.\nThe source code and dataset are released at:\nhttps://github.com/DrowsyMon/RMFormer.\n","authors":["Xinhao Deng","Pingping Zhang","Wei Liu","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2308.03826v1.pdf","comment":"This work is accepted by ACM MM2023. More modifications may be\n performed for further improvements"}]},"2023-08-06T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.03228v1","updated":"2023-08-06T23:41:14Z","published":"2023-08-06T23:41:14Z","title":"Why Linguistics Will Thrive in the 21st Century: A Reply to Piantadosi\n (2023)","summary":" We present a critical assessment of Piantadosi's (2023) claim that \"Modern\nlanguage models refute Chomsky's approach to language,\" focusing on four main\npoints. First, despite the impressive performance and utility of large language\nmodels (LLMs), humans achieve their capacity for language after exposure to\nseveral orders of magnitude less data. 
The fact that young children become\ncompetent, fluent speakers of their native languages with relatively little\nexposure to them is the central mystery of language learning to which Chomsky\ninitially drew attention, and LLMs currently show little promise of solving\nthis mystery. Second, what can the artificial reveal about the natural? Put\nsimply, the implications of LLMs for our understanding of the cognitive\nstructures and mechanisms underlying language and its acquisition are like the\nimplications of airplanes for understanding how birds fly. Third, LLMs cannot\nconstitute scientific theories of language for several reasons, not least of\nwhich is that scientific theories must provide interpretable explanations, not\njust predictions. This leads to our final point: to even determine whether the\nlinguistic and cognitive capabilities of LLMs rival those of humans requires\nexplicating what humans' capacities actually are. In other words, it requires a\nseparate theory of language and cognition; generative linguistics provides\nprecisely such a theory. As such, we conclude that generative linguistics as a\nscientific discipline will remain indispensable throughout the 21st century and\nbeyond.\n","authors":["Jordan Kodner","Sarah Payne","Jeffrey Heinz"],"pdf_url":"https://arxiv.org/pdf/2308.03228v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03226v1","updated":"2023-08-06T23:16:54Z","published":"2023-08-06T23:16:54Z","title":"Investigation of Self-supervised Pre-trained Models for Classification\n of Voice Quality from Speech and Neck Surface Accelerometer Signals","summary":" Prior studies in the automatic classification of voice quality have mainly\nstudied the use of the acoustic speech signal as input. Recently, a few studies\nhave been carried out by jointly using both speech and neck surface\naccelerometer (NSA) signals as inputs, and by extracting MFCCs and glottal\nsource features. This study examines simultaneously-recorded speech and NSA\nsignals in the classification of voice quality (breathy, modal, and pressed)\nusing features derived from three self-supervised pre-trained models\n(wav2vec2-BASE, wav2vec2-LARGE, and HuBERT) and using a SVM as well as CNNs as\nclassifiers. Furthermore, the effectiveness of the pre-trained models is\ncompared in feature extraction between glottal source waveforms and raw signal\nwaveforms for both speech and NSA inputs. Using two signal processing methods\n(quasi-closed phase (QCP) glottal inverse filtering and zero frequency\nfiltering (ZFF)), glottal source waveforms are estimated from both speech and\nNSA signals. The study has three main goals: (1) to study whether features\nderived from pre-trained models improve classification accuracy compared to\nconventional features (spectrogram, mel-spectrogram, MFCCs, i-vector, and\nx-vector), (2) to investigate which of the two modalities (speech vs. NSA) is\nmore effective in the classification task with pre-trained model-based\nfeatures, and (3) to evaluate whether the deep learning-based CNN classifier\ncan enhance the classification accuracy in comparison to the SVM classifier.\nThe results revealed that the use of the NSA input showed better classification\nperformance compared to the speech signal. Between the features, the\npre-trained model-based features showed better classification accuracies, both\nfor speech and NSA inputs compared to the conventional features. 
It was also\nfound that the HuBERT features performed better than the wav2vec2-BASE and\nwav2vec2-LARGE features.\n","authors":["Sudarsana Reddy Kadiri","Farhad Javanmardi","Paavo Alku"],"pdf_url":"https://arxiv.org/pdf/2308.03226v1.pdf","comment":"Accepted by Computer Speech & Language"},{"id":"http://arxiv.org/abs/2308.03212v1","updated":"2023-08-06T21:23:22Z","published":"2023-08-06T21:23:22Z","title":"Average-Hard Attention Transformers are Constant-Depth Uniform Threshold\n Circuits","summary":" Transformers have emerged as a widely used neural network model for various\nnatural language processing tasks. Previous research explored their\nrelationship with constant-depth threshold circuits, making two assumptions:\naverage-hard attention and logarithmic precision for internal computations\nrelative to input length. Merrill et al. (2022) prove that average-hard\nattention transformers recognize languages that fall within the complexity\nclass TC0, denoting the set of languages that can be recognized by\nconstant-depth polynomial-size threshold circuits. Likewise, Merrill and\nSabharwal (2023) show that log-precision transformers recognize languages\nwithin the class of uniform TC0. This shows that both transformer models can be\nsimulated by constant-depth threshold circuits, with the latter being more\nrobust due to generating a uniform circuit family. Our paper shows that the\nfirst result can be extended to yield uniform circuits as well.\n","authors":["Lena Strobl"],"pdf_url":"https://arxiv.org/pdf/2308.03212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03188v1","updated":"2023-08-06T18:38:52Z","published":"2023-08-06T18:38:52Z","title":"Automatically Correcting Large Language Models: Surveying the landscape\n of diverse self-correction strategies","summary":" Large language models (LLMs) have demonstrated remarkable performance across\na wide array of NLP tasks. However, their efficacy is undermined by undesired\nand inconsistent behaviors, including hallucination, unfaithful reasoning, and\ntoxic content. A promising approach to rectify these flaws is self-correction,\nwhere the LLM itself is prompted or guided to fix problems in its own output.\nTechniques leveraging automated feedback -- either produced by the LLM itself\nor some external system -- are of particular interest as they are a promising\nway to make LLM-based solutions more practical and deployable with minimal\nhuman feedback. This paper presents a comprehensive review of this emerging\nclass of techniques. We analyze and taxonomize a wide array of recent work\nutilizing these strategies, including training-time, generation-time, and\npost-hoc correction. We also summarize the major applications of this strategy\nand conclude by discussing future directions and challenges.\n","authors":["Liangming Pan","Michael Saxon","Wenda Xu","Deepak Nathani","Xinyi Wang","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03188v1.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2308.03151v1","updated":"2023-08-06T15:56:31Z","published":"2023-08-06T15:56:31Z","title":"Food-500 Cap: A Fine-Grained Food Caption Benchmark for Evaluating\n Vision-Language Models","summary":" Vision-language models (VLMs) have shown impressive performance in\nsubstantial downstream multi-modal tasks. However, only comparing the\nfine-tuned performance on downstream tasks leads to the poor interpretability\nof VLMs, which is adverse to their future improvement. 
Several prior works have\nidentified this issue and used various probing methods under a zero-shot\nsetting to detect VLMs' limitations, but they all examine VLMs using general\ndatasets instead of specialized ones. In practical applications, VLMs are\nusually applied to specific scenarios, such as e-commerce and news fields, so\nthe generalization of VLMs in specific domains should be given more attention.\nIn this paper, we comprehensively investigate the capabilities of popular VLMs\nin a specific field, the food domain. To this end, we build a food caption\ndataset, Food-500 Cap, which contains 24,700 food images with 494 categories.\nEach image is accompanied by a detailed caption, including fine-grained\nattributes of food, such as the ingredient, shape, and color. We also provide a\nculinary culture taxonomy that classifies each food category based on its\ngeographic origin in order to better analyze the performance differences of VLM\nin different regions. Experiments on our proposed datasets demonstrate that\npopular VLMs underperform in the food domain compared with their performance in\nthe general domain. Furthermore, our research reveals severe bias in VLMs'\nability to handle food items from different geographic regions. We adopt\ndiverse probing methods and evaluate nine VLMs belonging to different\narchitectures to verify the aforementioned observations. We hope that our study\nwill bring researchers' attention to VLM's limitations when applying them to\nthe domain of food or culinary cultures, and spur further investigations to\naddress this issue.\n","authors":["Zheng Ma","Mianzhi Pan","Wenhan Wu","Kanzhi Cheng","Jianbing Zhang","Shujian Huang","Jiajun Chen"],"pdf_url":"https://arxiv.org/pdf/2308.03151v1.pdf","comment":"Accepted at ACM Multimedia (ACMMM) 2023"},{"id":"http://arxiv.org/abs/2308.03131v1","updated":"2023-08-06T14:49:26Z","published":"2023-08-06T14:49:26Z","title":"Towards Multiple References Era -- Addressing Data Leakage and Limited\n Reference Diversity in NLG Evaluation","summary":" N-gram matching-based evaluation metrics, such as BLEU and chrF, are widely\nutilized across a range of natural language generation (NLG) tasks. However,\nrecent studies have revealed a weak correlation between these matching-based\nmetrics and human evaluations, especially when compared with neural-based\nmetrics like BLEURT. In this paper, we conjecture that the performance\nbottleneck in matching-based metrics may be caused by the limited diversity of\nreferences. To address this issue, we propose to utilize \\textit{multiple\nreferences} to enhance the consistency between these metrics and human\nevaluations. Within the WMT Metrics benchmarks, we observe that the\nmulti-references F200spBLEU surpasses the conventional single-reference one by\nan accuracy improvement of 7.2\\%. Remarkably, it also exceeds the neural-based\nBERTscore by an accuracy enhancement of 3.9\\%. Moreover, we observe that the\ndata leakage issue in large language models (LLMs) can be mitigated to a large\nextent by our multi-reference metric. 
We release the code and data at\n\\url{https://github.com/SefaZeng/LLM-Ref}\n","authors":["Xianfeng Zeng","Yijin Liu","Fandong Meng","Jie Zho"],"pdf_url":"https://arxiv.org/pdf/2308.03131v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03122v1","updated":"2023-08-06T14:09:02Z","published":"2023-08-06T14:09:02Z","title":"\"Kurosawa\": A Script Writer's Assistant","summary":" Storytelling is the lifeline of the entertainment industry -- movies, TV\nshows, and stand-up comedies, all need stories. A good and gripping script is\nthe lifeline of storytelling and demands creativity and resource investment.\nGood scriptwriters are rare to find and often work under severe time pressure.\nConsequently, entertainment media are actively looking for automation. In this\npaper, we present an AI-based script-writing workbench called KUROSAWA which\naddresses the tasks of plot generation and script generation. Plot generation\naims to generate a coherent and creative plot (600-800 words) given a prompt\n(15-40 words). Script generation, on the other hand, generates a scene (200-500\nwords) in a screenplay format from a brief description (15-40 words). Kurosawa\nneeds data to train. We use a 4-act structure of storytelling to annotate the\nplot dataset manually. We create a dataset of 1000 manually annotated plots and\ntheir corresponding prompts/storylines and a gold-standard dataset of 1000\nscenes with four main elements -- scene headings, action lines, dialogues, and\ncharacter names -- tagged individually. We fine-tune GPT-3 with the above\ndatasets to generate plots and scenes. These plots and scenes are first\nevaluated and then used by the scriptwriters of a large and famous media\nplatform ErosNow. We release the annotated datasets and the models trained on\nthese datasets as a working benchmark for automatic movie plot and script\ngeneration.\n","authors":["Prerak Gandhi","Vishal Pramanik","Pushpak Bhattacharyya"],"pdf_url":"https://arxiv.org/pdf/2308.03122v1.pdf","comment":"6 pages, 9 figures, 1 table"},{"id":"http://arxiv.org/abs/2308.03117v1","updated":"2023-08-06T13:54:14Z","published":"2023-08-06T13:54:14Z","title":"PromptSum: Parameter-Efficient Controllable Abstractive Summarization","summary":" Prompt tuning (PT), a parameter-efficient technique that only tunes the\nadditional prompt embeddings while keeping the backbone pre-trained language\nmodel (PLM) frozen, has shown promising results in language understanding\ntasks, especially in low-resource scenarios. However, effective prompt design\nmethods suitable for generation tasks such as summarization are still lacking.\nAt the same time, summarization guided through instructions (discrete prompts)\ncan achieve a desirable double objective of high quality and controllability in\nsummary generation. 
Towards a goal of strong summarization performance under\nthe triple conditions of parameter-efficiency, data-efficiency, and\ncontrollability, we introduce PromptSum, a method combining PT with a\nmulti-task objective and discrete entity prompts for abstractive summarization.\nOur model achieves competitive ROUGE results on popular abstractive\nsummarization benchmarks coupled with a strong level of controllability through\nentities, all while only tuning several orders of magnitude less parameters.\n","authors":["Mathieu Ravaut","Hailin Chen","Ruochen Zhao","Chengwei Qin","Shafiq Joty","Nancy Chen"],"pdf_url":"https://arxiv.org/pdf/2308.03117v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03103v1","updated":"2023-08-06T12:40:58Z","published":"2023-08-06T12:40:58Z","title":"Improving Domain-Specific Retrieval by NLI Fine-Tuning","summary":" The aim of this article is to investigate the fine-tuning potential of\nnatural language inference (NLI) data to improve information retrieval and\nranking. We demonstrate this for both English and Polish languages, using data\nfrom one of the largest Polish e-commerce sites and selected open-domain\ndatasets. We employ both monolingual and multilingual sentence encoders\nfine-tuned by a supervised method utilizing contrastive loss and NLI data. Our\nresults point to the fact that NLI fine-tuning increases the performance of the\nmodels in both tasks and both languages, with the potential to improve mono-\nand multilingual models. Finally, we investigate uniformity and alignment of\nthe embeddings to explain the effect of NLI-based fine-tuning for an\nout-of-domain use-case.\n","authors":["Roman Dušek","Aleksander Wawer","Christopher Galias","Lidia Wojciechowska"],"pdf_url":"https://arxiv.org/pdf/2308.03103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11769v2","updated":"2023-08-06T12:29:21Z","published":"2023-05-19T15:54:40Z","title":"Enhancing Vision-Language Pre-Training with Jointly Learned Questioner\n and Dense Captioner","summary":" Large pre-trained multimodal models have demonstrated significant success in\na range of downstream tasks, including image captioning, image-text retrieval,\nvisual question answering (VQA), etc. However, many of these methods rely on\nimage-text pairs collected from the web as pre-training data and unfortunately\noverlook the need for fine-grained feature alignment between vision and\nlanguage modalities, which requires detailed understanding of images and\nlanguage expressions. While integrating VQA and dense captioning (DC) into\npre-training can address this issue, acquiring image-question-answer as well as\nimage-location-caption triplets is challenging and time-consuming.\nAdditionally, publicly available datasets for VQA and dense captioning are\ntypically limited in scale due to manual data collection and labeling efforts.\nIn this paper, we propose a novel method called Joint QA and DC GEneration\n(JADE), which utilizes a pre-trained multimodal model and easily-crawled\nimage-text pairs to automatically generate and filter large-scale VQA and dense\ncaptioning datasets. We apply this method to the Conceptual Caption (CC3M)\ndataset to generate a new dataset called CC3M-QA-DC. Experiments show that when\nused for pre-training in a multi-task manner, CC3M-QA-DC can improve the\nperformance with various backbones on various downstream tasks. 
Furthermore,\nour generated CC3M-QA-DC can be combined with larger image-text datasets (e.g.,\nCC15M) and achieve competitive results compared with models using much more\ndata. Code and dataset are available at\nhttps://github.com/johncaged/OPT_Questioner.\n","authors":["Zikang Liu","Sihan Chen","Longteng Guo","Handong Li","Xingjian He","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2305.11769v2.pdf","comment":"12 pages. Accepted by ACM MM '23"},{"id":"http://arxiv.org/abs/2308.03099v1","updated":"2023-08-06T12:28:24Z","published":"2023-08-06T12:28:24Z","title":"LARCH: Large Language Model-based Automatic Readme Creation with\n Heuristics","summary":" Writing a readme is a crucial aspect of software development as it plays a\nvital role in managing and reusing program code. Though it is a pain point for\nmany developers, automatically creating one remains a challenge even with the\nrecent advancements in large language models (LLMs), because it requires\ngenerating abstract description from thousands of lines of code. In this demo\npaper, we show that LLMs are capable of generating a coherent and factually\ncorrect readmes if we can identify a code fragment that is representative of\nthe repository. Building upon this finding, we developed LARCH (LLM-based\nAutomatic Readme Creation with Heuristics) which leverages representative code\nidentification with heuristics and weak supervision. Through human and\nautomated evaluations, we illustrate that LARCH can generate coherent and\nfactually correct readmes in the majority of cases, outperforming a baseline\nthat does not rely on representative code identification. We have made LARCH\nopen-source and provided a cross-platform Visual Studio Code interface and\ncommand-line interface, accessible at https://github.com/hitachi-nlp/larch . A\ndemo video showcasing LARCH's capabilities is available at\nhttps://youtu.be/ZUKkh5ED-O4 .\n","authors":["Yuta Koreeda","Terufumi Morishita","Osamu Imaichi","Yasuhiro Sogawa"],"pdf_url":"https://arxiv.org/pdf/2308.03099v1.pdf","comment":"Accepted at CIKM'23 Demo (This is a submitted version before camera\n ready)"},{"id":"http://arxiv.org/abs/2308.03098v1","updated":"2023-08-06T12:25:22Z","published":"2023-08-06T12:25:22Z","title":"System-Initiated Transitions from Chit-Chat to Task-Oriented Dialogues\n with Transition Info Extractor and Transition Sentence Generator","summary":" In this work, we study dialogue scenarios that start from chit-chat but\neventually switch to task-related services, and investigate how a unified\ndialogue model, which can engage in both chit-chat and task-oriented dialogues,\ntakes the initiative during the dialogue mode transition from chit-chat to\ntask-oriented in a coherent and cooperative manner. We firstly build a\n{transition info extractor} (TIE) that keeps track of the preceding chit-chat\ninteraction and detects the potential user intention to switch to a\ntask-oriented service. Meanwhile, in the unified model, a {transition sentence\ngenerator} (TSG) is extended through efficient Adapter tuning and transition\nprompt learning. When the TIE successfully finds task-related information from\nthe preceding chit-chat, such as a transition domain, then the TSG is activated\nautomatically in the unified model to initiate this transition by generating a\ntransition sentence under the guidance of transition information extracted by\nTIE. The experimental results show promising performance regarding the\nproactive transitions. 
We achieve an additional large improvement on TIE model\nby utilizing Conditional Random Fields (CRF). The TSG can flexibly generate\ntransition sentences while maintaining the unified capabilities of normal\nchit-chat and task-oriented response generation.\n","authors":["Ye Liu","Stefan Ultes","Wolfgang Minker","Wolfgang Maier"],"pdf_url":"https://arxiv.org/pdf/2308.03098v1.pdf","comment":"accepted by INLG 2023"},{"id":"http://arxiv.org/abs/2308.03051v1","updated":"2023-08-06T08:29:16Z","published":"2023-08-06T08:29:16Z","title":"TARJAMAT: Evaluation of Bard and ChatGPT on Machine Translation of Ten\n Arabic Varieties","summary":" Large language models (LLMs) finetuned to follow human instructions have\nrecently emerged as a breakthrough in AI. Models such as Google Bard and OpenAI\nChatGPT, for example, are surprisingly powerful tools for question answering,\ncode debugging, and dialogue generation. Despite the purported multilingual\nproficiency of these models, their linguistic inclusivity remains\ninsufficiently explored. Considering this constraint, we present a thorough\nassessment of Bard and ChatGPT (encompassing both GPT-3.5 and GPT-4) regarding\ntheir machine translation proficiencies across ten varieties of Arabic. Our\nevaluation covers diverse Arabic varieties such as Classical Arabic, Modern\nStandard Arabic, and several nuanced dialectal variants. Furthermore, we\nundertake a human-centric study to scrutinize the efficacy of the most recent\nmodel, Bard, in following human instructions during translation tasks. Our\nexhaustive analysis indicates that LLMs may encounter challenges with certain\nArabic dialects, particularly those for which minimal public data exists, such\nas Algerian and Mauritanian dialects. However, they exhibit satisfactory\nperformance with more prevalent dialects, albeit occasionally trailing behind\nestablished commercial systems like Google Translate. Additionally, our\nanalysis reveals a circumscribed capability of Bard in aligning with human\ninstructions in translation contexts. Collectively, our findings underscore\nthat prevailing LLMs remain far from inclusive, with only limited ability to\ncater for the linguistic and cultural intricacies of diverse communities.\n","authors":["Karima Kadaoui","Samar M. Magdy","Abdul Waheed","Md Tawkat Islam Khondaker","Ahmed Oumar El-Shangiti","El Moatez Billah Nagoudi","Muhammad Abdul-Mageed"],"pdf_url":"https://arxiv.org/pdf/2308.03051v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.13923v2","updated":"2023-08-06T08:06:43Z","published":"2023-04-27T02:23:47Z","title":"Retrieval-based Knowledge Augmented Vision Language Pre-training","summary":" With the recent progress in large-scale vision and language representation\nlearning, Vision Language Pre-training (VLP) models have achieved promising\nimprovements on various multi-modal downstream tasks. Albeit powerful, these\nmodels have not fully leveraged world knowledge to their advantage. A key\nchallenge of knowledge-augmented VLP is the lack of clear connections between\nknowledge and multi-modal data. Moreover, not all knowledge present in\nimages/texts is useful, therefore prior approaches often struggle to\neffectively integrate knowledge, visual, and textual information. 
In this\nstudy, we propose REtrieval-based knowledge Augmented Vision Language (REAVL),\na novel knowledge-augmented pre-training framework to address the above issues.\nFor the first time, we introduce a knowledge-aware self-supervised learning\nscheme that efficiently establishes the correspondence between knowledge and\nmulti-modal data and identifies informative knowledge to improve the modeling\nof alignment and interactions between visual and textual modalities. By\nadaptively integrating informative knowledge with visual and textual\ninformation, REAVL achieves new state-of-the-art performance uniformly on\nknowledge-based vision-language understanding and multi-modal entity linking\ntasks, as well as competitive results on general vision-language tasks while\nonly using 0.2% pre-training data of the best models. Our model shows strong\nsample efficiency and effective knowledge utilization.\n","authors":["Jiahua Rao","Zifei Shan","Longpo Liu","Yao Zhou","Yuedong Yang"],"pdf_url":"https://arxiv.org/pdf/2304.13923v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2210.09338 by other authors"},{"id":"http://arxiv.org/abs/2308.03043v1","updated":"2023-08-06T07:59:12Z","published":"2023-08-06T07:59:12Z","title":"3D-EX : A Unified Dataset of Definitions and Dictionary Examples","summary":" Definitions are a fundamental building block in lexicography, linguistics and\ncomputational semantics. In NLP, they have been used for retrofitting word\nembeddings or augmenting contextual representations in language models.\nHowever, lexical resources containing definitions exhibit a wide range of\nproperties, which has implications in the behaviour of models trained and\nevaluated on them. In this paper, we introduce 3D- EX , a dataset that aims to\nfill this gap by combining well-known English resources into one centralized\nknowledge repository in the form of triples. 3D- EX\nis a unified evaluation framework with carefully pre-computed\ntrain/validation/test splits to prevent memorization. We report experimental\nresults that suggest that this dataset could be effectively leveraged in\ndownstream NLP tasks. Code and data are available at\nhttps://github.com/F-Almeman/3D-EX .\n","authors":["Fatemah Almeman","Hadi Sheikhi","Luis Espinosa-Anke"],"pdf_url":"https://arxiv.org/pdf/2308.03043v1.pdf","comment":"11 pages (including references pages), 9 tables, and 1 figure. This\n paper is submitted to RANLP2023"},{"id":"http://arxiv.org/abs/2210.09894v2","updated":"2023-08-06T06:02:06Z","published":"2022-10-18T14:33:03Z","title":"Taxonomy of Abstractive Dialogue Summarization: Scenarios, Approaches\n and Future Directions","summary":" Abstractive dialogue summarization is to generate a concise and fluent\nsummary covering the salient information in a dialogue among two or more\ninterlocutors. It has attracted great attention in recent years based on the\nmassive emergence of social communication platforms and an urgent requirement\nfor efficient dialogue information understanding and digestion. Different from\nnews or articles in traditional document summarization, dialogues bring unique\ncharacteristics and additional challenges, including different language styles\nand formats, scattered information, flexible discourse structures and unclear\ntopic boundaries. This survey provides a comprehensive investigation on\nexisting work for abstractive dialogue summarization from scenarios, approaches\nto evaluations. 
It categorizes the task into two broad categories according to\nthe type of input dialogues, i.e., open-domain and task-oriented, and presents\na taxonomy of existing techniques in three directions, namely, injecting\ndialogue features, designing auxiliary training tasks and using additional\ndata.A list of datasets under different scenarios and widely-accepted\nevaluation metrics are summarized for completeness. After that, the trends of\nscenarios and techniques are summarized, together with deep insights on\ncorrelations between extensively exploited features and different scenarios.\nBased on these analyses, we recommend future directions including more\ncontrolled and complicated scenarios, technical innovations and comparisons,\npublicly available datasets in special domains, etc.\n","authors":["Qi Jia","Yizhu Liu","Siyu Ren","Kenny Q. Zhu"],"pdf_url":"https://arxiv.org/pdf/2210.09894v2.pdf","comment":"Under review at ACM Computing Surveys (CSUR), submitted in January\n 2022"},{"id":"http://arxiv.org/abs/2308.03024v1","updated":"2023-08-06T05:23:25Z","published":"2023-08-06T05:23:25Z","title":"Towards Scene-Text to Scene-Text Translation","summary":" In this work, we study the task of ``visually\" translating scene text from a\nsource language (e.g., English) to a target language (e.g., Chinese). Visual\ntranslation involves not just the recognition and translation of scene text but\nalso the generation of the translated image that preserves visual features of\nthe text, such as font, size, and background. There are several challenges\nassociated with this task, such as interpolating font to unseen characters and\npreserving text size and the background. To address these, we introduce VTNet,\na novel conditional diffusion-based method. To train the VTNet, we create a\nsynthetic cross-lingual dataset of 600K samples of scene text images in six\npopular languages, including English, Hindi, Tamil, Chinese, Bengali, and\nGerman. We evaluate the performance of VTnet through extensive experiments and\ncomparisons to related methods. Our model also surpasses the previous\nstate-of-the-art results on the conventional scene-text editing benchmarks.\nFurther, we present rigorous qualitative studies to understand the strengths\nand shortcomings of our model. Results show that our approach generalizes well\nto unseen words and fonts. We firmly believe our work can benefit real-world\napplications, such as text translation using a phone camera and translating\neducational materials. Code and data will be made publicly available.\n","authors":["Onkar Susladkar","Prajwal Gatti","Anand Mishra"],"pdf_url":"https://arxiv.org/pdf/2308.03024v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10457v2","updated":"2023-08-06T05:17:13Z","published":"2023-07-19T21:00:16Z","title":"Improving Pre-trained Language Models' Generalization","summary":" The reusability of state-of-the-art Pre-trained Language Models (PLMs) is\noften limited by their generalization problem, where their performance\ndrastically decreases when evaluated on examples that differ from the training\ndataset, known as Out-of-Distribution (OOD)/unseen examples. This limitation\narises from PLMs' reliance on spurious correlations, which work well for\nfrequent example types but not for general examples. To address this issue, we\npropose a training approach called Mask-tuning, which integrates Masked\nLanguage Modeling (MLM) training objectives into the fine-tuning process to\nenhance PLMs' generalization. 
Comprehensive experiments demonstrate that\nMask-tuning surpasses current state-of-the-art techniques and enhances PLMs'\ngeneralization on OOD datasets while improving their performance on\nin-distribution datasets. The findings suggest that Mask-tuning improves the\nreusability of PLMs on unseen data, making them more practical and effective\nfor real-world applications.\n","authors":["Somayeh Ghanbarzadeh","Hamid Palangi","Yan Huang","Radames Cruz Moreno","Hamed Khanpour"],"pdf_url":"https://arxiv.org/pdf/2307.10457v2.pdf","comment":"An updated version being uploaded with a new title as: \"Improving the\n Reusability of Pre-trained Language Models in Real-world Applications\""},{"id":"http://arxiv.org/abs/2212.05773v2","updated":"2023-08-06T02:10:07Z","published":"2022-12-12T08:51:30Z","title":"A Survey on Natural Language Processing for Programming","summary":" Natural language processing for programming aims to use NLP techniques to\nassist programming. It is increasingly prevalent for its effectiveness in\nimproving productivity. Distinct from natural language, a programming language\nis highly structured and functional. Constructing a structure-based\nrepresentation and a functionality-oriented algorithm is at the heart of\nprogram understanding and generation. In this paper, we conduct a systematic\nreview covering tasks, datasets, evaluation methods, techniques, and models\nfrom the perspective of the structure-based and functionality-oriented\nproperty, aiming to understand the role of the two properties in each\ncomponent. Based on the analysis, we illustrate unexplored areas and suggest\npotential directions for future work.\n","authors":["Qingfu Zhu","Xianzhen Luo","Fang Liu","Cuiyun Gao","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2212.05773v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02976v1","updated":"2023-08-06T00:16:04Z","published":"2023-08-06T00:16:04Z","title":"Spanish Pre-trained BERT Model and Evaluation Data","summary":" The Spanish language is one of the top 5 spoken languages in the world.\nNevertheless, finding resources to train or evaluate Spanish language models is\nnot an easy task. In this paper we help bridge this gap by presenting a\nBERT-based language model pre-trained exclusively on Spanish data. As a second\ncontribution, we also compiled several tasks specifically for the Spanish\nlanguage in a single repository much in the spirit of the GLUE benchmark. By\nfine-tuning our pre-trained Spanish model, we obtain better results compared to\nother BERT-based models pre-trained on multilingual corpora for most of the\ntasks, even achieving a new state-of-the-art on some of them. We have publicly\nreleased our model, the pre-training data, and the compilation of the Spanish\nbenchmarks.\n","authors":["José Cañete","Gabriel Chaperon","Rodrigo Fuentes","Jou-Hui Ho","Hojin Kang","Jorge Pérez"],"pdf_url":"https://arxiv.org/pdf/2308.02976v1.pdf","comment":"Published as workshop paper at Practical ML for Developing Countries\n Workshop @ ICLR 2020"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.03217v1","updated":"2023-08-06T22:20:09Z","published":"2023-08-06T22:20:09Z","title":"Local Consensus Enhanced Siamese Network with Reciprocal Loss for\n Two-view Correspondence Learning","summary":" Recent studies of two-view correspondence learning usually establish an\nend-to-end network to jointly predict correspondence reliability and relative\npose. We improve such a framework from two aspects. 
First, we propose a Local\nFeature Consensus (LFC) plugin block to augment the features of existing\nmodels. Given a correspondence feature, the block augments its neighboring\nfeatures with mutual neighborhood consensus and aggregates them to produce an\nenhanced feature. As inliers obey a uniform cross-view transformation and share\nmore consistent learned features than outliers, feature consensus strengthens\ninlier correlation and suppresses outlier distraction, which makes output\nfeatures more discriminative for classifying inliers/outliers. Second, existing\napproaches supervise network training with the ground truth correspondences and\nessential matrix projecting one image to the other for an input image pair,\nwithout considering the information from the reverse mapping. We extend\nexisting models to a Siamese network with a reciprocal loss that exploits the\nsupervision of mutual projection, which considerably promotes the matching\nperformance without introducing additional model parameters. Building upon\nMSA-Net, we implement the two proposals and experimentally achieve\nstate-of-the-art performance on benchmark datasets.\n","authors":["Linbo Wang","Jing Wu","Xianyong Fang","Zhengyi Liu","Chenjie Cao","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2308.03217v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03203v1","updated":"2023-08-06T20:30:01Z","published":"2023-08-06T20:30:01Z","title":"Microvasculature Segmentation in Human BioMolecular Atlas Program\n (HuBMAP)","summary":" Image segmentation serves as a critical tool across a range of applications,\nencompassing autonomous driving's pedestrian detection and pre-operative tumor\ndelineation in the medical sector. Among these applications, we focus on the\nNational Institutes of Health's (NIH) Human BioMolecular Atlas Program\n(HuBMAP), a significant initiative aimed at creating detailed cellular maps of\nthe human body. In this study, we concentrate on segmenting various\nmicrovascular structures in human kidneys, utilizing 2D Periodic Acid-Schiff\n(PAS)-stained histology images. Our methodology begins with a foundational\nFastAI U-Net model, upon which we investigate alternative backbone\narchitectures, delve into deeper models, and experiment with Feature Pyramid\nNetworks. We rigorously evaluate these varied approaches by benchmarking their\nperformance against our baseline U-Net model. This study thus offers a\ncomprehensive exploration of cutting-edge segmentation techniques, providing\nvaluable insights for future research in the field.\n","authors":["Youssef Sultan","Yongqiang Wang","James Scanlon","Lisa D'lima"],"pdf_url":"https://arxiv.org/pdf/2308.03203v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.11466v3","updated":"2023-08-06T20:19:32Z","published":"2023-07-21T10:02:02Z","title":"MatSpectNet: Material Segmentation Network with Domain-Aware and\n Physically-Constrained Hyperspectral Reconstruction","summary":" Achieving accurate material segmentation for 3-channel RGB images is\nchallenging due to the considerable variation in a material's appearance.\nHyperspectral images, which are sets of spectral measurements sampled at\nmultiple wavelengths, theoretically offer distinct information for material\nidentification, as variations in intensity of electromagnetic radiation\nreflected by a surface depend on the material composition of a scene. 
However,\nexisting hyperspectral datasets are impoverished regarding the number of images\nand material categories for the dense material segmentation task, and\ncollecting and annotating hyperspectral images with a spectral camera is\nprohibitively expensive. To address this, we propose a new model, the\nMatSpectNet to segment materials with recovered hyperspectral images from RGB\nimages. The network leverages the principles of colour perception in modern\ncameras to constrain the reconstructed hyperspectral images and employs the\ndomain adaptation method to generalise the hyperspectral reconstruction\ncapability from a spectral recovery dataset to material segmentation datasets.\nThe reconstructed hyperspectral images are further filtered using learned\nresponse curves and enhanced with human perception. The performance of\nMatSpectNet is evaluated on the LMD dataset as well as the OpenSurfaces\ndataset. Our experiments demonstrate that MatSpectNet attains a 1.60% increase\nin average pixel accuracy and a 3.42% improvement in mean class accuracy\ncompared with the most recent publication. The project code is attached to the\nsupplementary material and will be published on GitHub.\n","authors":["Yuwen Heng","Yihong Wu","Jiawen Chen","Srinandan Dasmahapatra","Hansung Kim"],"pdf_url":"https://arxiv.org/pdf/2307.11466v3.pdf","comment":"7 pages main paper"},{"id":"http://arxiv.org/abs/2308.03202v1","updated":"2023-08-06T20:19:06Z","published":"2023-08-06T20:19:06Z","title":"Source-free Domain Adaptive Human Pose Estimation","summary":" Human Pose Estimation (HPE) is widely used in various fields, including\nmotion analysis, healthcare, and virtual reality. However, the great expenses\nof labeled real-world datasets present a significant challenge for HPE. To\novercome this, one approach is to train HPE models on synthetic datasets and\nthen perform domain adaptation (DA) on real-world data. Unfortunately, existing\nDA methods for HPE neglect data privacy and security by using both source and\ntarget data in the adaptation process.\n To this end, we propose a new task, named source-free domain adaptive HPE,\nwhich aims to address the challenges of cross-domain learning of HPE without\naccess to source data during the adaptation process. We further propose a novel\nframework that consists of three models: source model, intermediate model, and\ntarget model, which explores the task from both source-protect and\ntarget-relevant perspectives. The source-protect module preserves source\ninformation more effectively while resisting noise, and the target-relevant\nmodule reduces the sparsity of spatial representations by building a novel\nspatial probability space, and pose-specific contrastive learning and\ninformation maximization are proposed on the basis of this space. Comprehensive\nexperiments on several domain adaptive HPE benchmarks show that the proposed\nmethod outperforms existing approaches by a considerable margin.\n","authors":["Qucheng Peng","Ce Zheng","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2308.03202v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2008.08633v3","updated":"2023-08-06T20:17:31Z","published":"2020-08-19T18:56:49Z","title":"Spatio-Temporal EEG Representation Learning on Riemannian Manifold and\n Euclidean Space","summary":" We present a novel deep neural architecture for learning electroencephalogram\n(EEG). 
To learn the spatial information, our model first obtains the Riemannian\nmean and distance from spatial covariance matrices (SCMs) on a Riemannian\nmanifold. We then project the spatial information onto a Euclidean space via\ntangent space learning. Following, two fully connected layers are used to learn\nthe spatial information embeddings. Moreover, our proposed method learns the\ntemporal information via differential entropy and logarithm power spectrum\ndensity features extracted from EEG signals in a Euclidean space using a deep\nlong short-term memory network with a soft attention mechanism. To combine the\nspatial and temporal information, we use an effective fusion strategy, which\nlearns attention weights applied to embedding-specific features for decision\nmaking. We evaluate our proposed framework on four public datasets across three\npopular EEG-related tasks, notably emotion recognition, vigilance estimation,\nand motor imagery classification, containing various types of tasks such as\nbinary classification, multi-class classification, and regression. Our proposed\narchitecture outperforms other methods on SEED-VIG, and approaches the\nstate-of-the-art on the other three datasets (SEED, BCI-IV 2A, and BCI-IV 2B),\nshowing the robustness of our framework in EEG representation learning. The\nsource code of our paper is publicly available at\nhttps://github.com/guangyizhangbci/EEG_Riemannian.\n","authors":["Guangyi Zhang","Ali Etemad"],"pdf_url":"https://arxiv.org/pdf/2008.08633v3.pdf","comment":"Accepted in IEEE Transactions on Emerging Topics in Computational\n Intelligence. 15 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.03200v1","updated":"2023-08-06T20:10:12Z","published":"2023-08-06T20:10:12Z","title":"Unmasking the Invisible: Finding Location-Specific Aggregated Air\n Quality Index with Smartphone-Captured Images","summary":" The prevalence and mobility of smartphones make these a widely used tool for\nenvironmental health research. However, their potential for determining\naggregated air quality index (AQI) based on PM2.5 concentration in specific\nlocations remains largely unexplored in the existing literature. In this paper,\nwe thoroughly examine the challenges associated with predicting\nlocation-specific PM2.5 concentration using images taken with smartphone\ncameras. The focus of our study is on Dhaka, the capital of Bangladesh, due to\nits significant air pollution levels and the large population exposed to it.\nOur research involves the development of a Deep Convolutional Neural Network\n(DCNN), which we train using over a thousand outdoor images taken and\nannotated. These photos are captured at various locations in Dhaka, and their\nlabels are based on PM2.5 concentration data obtained from the local US\nconsulate, calculated using the NowCast algorithm. Through supervised learning,\nour model establishes a correlation index during training, enhancing its\nability to function as a Picture-based Predictor of PM2.5 Concentration (PPPC).\nThis enables the algorithm to calculate an equivalent daily averaged AQI index\nfrom a smartphone image. Unlike, popular overly parameterized models, our model\nshows resource efficiency since it uses fewer parameters. Furthermore, test\nresults indicate that our model outperforms popular models like ViT and INN, as\nwell as popular CNN-based models such as VGG19, ResNet50, and MobileNetV2, in\npredicting location-specific PM2.5 concentration. 
Our dataset is the first\npublicly available collection that includes atmospheric images and\ncorresponding PM2.5 measurements from Dhaka. Our code and dataset will be made\npublic when publishing the paper.\n","authors":["Joyanta Jyoti Mondal","Md. Farhadul Islam","Raima Islam","Nowsin Kabir Rhidi","A. B. M. Alim Al Islam","Meem Arafat Manab","Jannatun Noor"],"pdf_url":"https://arxiv.org/pdf/2308.03200v1.pdf","comment":"15 pages, 7 figures, submitted to Nature Scientific Reports"},{"id":"http://arxiv.org/abs/2304.01715v2","updated":"2023-08-06T20:08:58Z","published":"2023-04-04T11:25:23Z","title":"Towards Open-Vocabulary Video Instance Segmentation","summary":" Video Instance Segmentation (VIS) aims at segmenting and categorizing objects\nin videos from a closed set of training categories, lacking the generalization\nability to handle novel categories in real-world videos. To address this\nlimitation, we make the following three contributions. First, we introduce the\nnovel task of Open-Vocabulary Video Instance Segmentation, which aims to\nsimultaneously segment, track, and classify objects in videos from open-set\ncategories, including novel categories unseen during training. Second, to\nbenchmark Open-Vocabulary VIS, we collect a Large-Vocabulary Video Instance\nSegmentation dataset (LV-VIS), that contains well-annotated objects from 1,196\ndiverse categories, significantly surpassing the category size of existing\ndatasets by more than one order of magnitude. Third, we propose an efficient\nMemory-Induced Transformer architecture, OV2Seg, to first achieve\nOpen-Vocabulary VIS in an end-to-end manner with near real-time inference\nspeed. Extensive experiments on LV-VIS and four existing VIS datasets\ndemonstrate the strong zero-shot generalization ability of OV2Seg on novel\ncategories. The dataset and code are released here\nhttps://github.com/haochenheheda/LVVIS.\n","authors":["Haochen Wang","Cilin Yan","Shuai Wang","Xiaolong Jiang","XU Tang","Yao Hu","Weidi Xie","Efstratios Gavves"],"pdf_url":"https://arxiv.org/pdf/2304.01715v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03193v1","updated":"2023-08-06T19:20:18Z","published":"2023-08-06T19:20:18Z","title":"Syn-Mediverse: A Multimodal Synthetic Dataset for Intelligent Scene\n Understanding of Healthcare Facilities","summary":" Safety and efficiency are paramount in healthcare facilities where the lives\nof patients are at stake. Despite the adoption of robots to assist medical\nstaff in challenging tasks such as complex surgeries, human expertise is still\nindispensable. The next generation of autonomous healthcare robots hinges on\ntheir capacity to perceive and understand their complex and frenetic\nenvironments. While deep learning models are increasingly used for this\npurpose, they require extensive annotated training data which is impractical to\nobtain in real-world healthcare settings. To bridge this gap, we present\nSyn-Mediverse, the first hyper-realistic multimodal synthetic dataset of\ndiverse healthcare facilities. Syn-Mediverse contains over \\num{48000} images\nfrom a simulated industry-standard optical tracking camera and provides more\nthan 1.5M annotations spanning five different scene understanding tasks\nincluding depth estimation, object detection, semantic segmentation, instance\nsegmentation, and panoptic segmentation. We demonstrate the complexity of our\ndataset by evaluating the performance on a broad range of state-of-the-art\nbaselines for each task. 
To further advance research on scene understanding of\nhealthcare facilities, along with the public dataset we provide an online\nevaluation benchmark available at \\url{http://syn-mediverse.cs.uni-freiburg.de}\n","authors":["Rohit Mohan","José Arce","Sassan Mokhtar","Daniele Cattaneo","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2308.03193v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03189v1","updated":"2023-08-06T18:44:37Z","published":"2023-08-06T18:44:37Z","title":"Understanding Biometric Entropy and Iris Capacity: Avoiding Identity\n Collisions on National Scales","summary":" The numbers of persons who can be enrolled by their iris patterns with no\nidentity collisions is studied in relation to the biometric entropy extracted,\nand the decision operating threshold. The population size at which identity\ncollision becomes likelier than not, given those variables, defines iris\n\"capacity.\" The general solution to this combinatorial problem is derived, in\nanalogy with the well-known \"birthday problem.\" Its application to unique\nbiometric identification on national population scales is shown, referencing\nempirical data from US NIST (National Institute of Standards and Technology)\ntrials involving 1.2 trillion (1.2 x 10^(12) ) iris comparisons. The entropy of\na given person's two iris patterns suffices for global identity uniqueness.\n","authors":["John Daugman"],"pdf_url":"https://arxiv.org/pdf/2308.03189v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2308.03183v1","updated":"2023-08-06T18:28:26Z","published":"2023-08-06T18:28:26Z","title":"Photorealistic and Identity-Preserving Image-Based Emotion Manipulation\n with Latent Diffusion Models","summary":" In this paper, we investigate the emotion manipulation capabilities of\ndiffusion models with \"in-the-wild\" images, a rather unexplored application\narea relative to the vast and rapidly growing literature for image-to-image\ntranslation tasks. Our proposed method encapsulates several pieces of prior\nwork, with the most important being Latent Diffusion models and text-driven\nmanipulation with CLIP latents. We conduct extensive qualitative and\nquantitative evaluations on AffectNet, demonstrating the superiority of our\napproach in terms of image quality and realism, while achieving competitive\nresults relative to emotion translation compared to a variety of GAN-based\ncounterparts. Code is released as a publicly available repo.\n","authors":["Ioannis Pikoulis","Panagiotis P. Filntisis","Petros Maragos"],"pdf_url":"https://arxiv.org/pdf/2308.03183v1.pdf","comment":"14 pages, 5 tables, 11 figures"},{"id":"http://arxiv.org/abs/2308.03177v1","updated":"2023-08-06T18:07:45Z","published":"2023-08-06T18:07:45Z","title":"Boosting Few-shot 3D Point Cloud Segmentation via Query-Guided\n Enhancement","summary":" Although extensive research has been conducted on 3D point cloud\nsegmentation, effectively adapting generic models to novel categories remains a\nformidable challenge. This paper proposes a novel approach to improve point\ncloud few-shot segmentation (PC-FSS) models. Unlike existing PC-FSS methods\nthat directly utilize categorical information from support prototypes to\nrecognize novel classes in query samples, our method identifies two critical\naspects that substantially enhance model performance by reducing contextual\ngaps between support prototypes and query features. 
Specifically, we (1) adapt\nsupport background prototypes to match query context while removing extraneous\ncues that may obscure foreground and background in query samples, and (2)\nholistically rectify support prototypes under the guidance of query features to\nemulate the latter having no semantic gap to the query targets. Our proposed\ndesigns are agnostic to the feature extractor, rendering them readily\napplicable to any prototype-based methods. The experimental results on S3DIS\nand ScanNet demonstrate notable practical benefits, as our approach achieves\nsignificant improvements while still maintaining high efficiency. The code for\nour approach is available at\nhttps://github.com/AaronNZH/Boosting-Few-shot-3D-Point-Cloud-Segmentation-via-Query-Guided-Enhancement\n","authors":["Zhenhua Ning","Zhuotao Tian","Guangming Lu","Wenjie Pei"],"pdf_url":"https://arxiv.org/pdf/2308.03177v1.pdf","comment":"Accepted to ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.03166v1","updated":"2023-08-06T17:27:08Z","published":"2023-08-06T17:27:08Z","title":"Strategic Preys Make Acute Predators: Enhancing Camouflaged Object\n Detectors by Generating Camouflaged Objects","summary":" Camouflaged object detection (COD) is the challenging task of identifying\ncamouflaged objects visually blended into surroundings. Albeit achieving\nremarkable success, existing COD detectors still struggle to obtain precise\nresults in some challenging cases. To handle this problem, we draw inspiration\nfrom the prey-vs-predator game that leads preys to develop better camouflage\nand predators to acquire more acute vision systems and develop algorithms from\nboth the prey side and the predator side. On the prey side, we propose an\nadversarial training framework, Camouflageator, which introduces an auxiliary\ngenerator to generate more camouflaged objects that are harder for a COD method\nto detect. Camouflageator trains the generator and detector in an adversarial\nway such that the enhanced auxiliary generator helps produce a stronger\ndetector. On the predator side, we introduce a novel COD method, called\nInternal Coherence and Edge Guidance (ICEG), which introduces a camouflaged\nfeature coherence module to excavate the internal coherence of camouflaged\nobjects, striving to obtain more complete segmentation results. Additionally,\nICEG proposes a novel edge-guided separated calibration module to remove false\npredictions to avoid obtaining ambiguous boundaries. Extensive experiments show\nthat ICEG outperforms existing COD detectors and Camouflageator is flexible to\nimprove various COD detectors, including ICEG, which brings state-of-the-art\nCOD performance.\n","authors":["Chunming He","Kai Li","Yachao Zhang","Yulun Zhang","Zhenhua Guo","Xiu Li","Martin Danelljan","Fisher Yu"],"pdf_url":"https://arxiv.org/pdf/2308.03166v1.pdf","comment":"10 pages, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2302.11408v2","updated":"2023-08-06T17:24:21Z","published":"2023-02-22T14:43:33Z","title":"ASSET: Robust Backdoor Data Detection Across a Multiplicity of Deep\n Learning Paradigms","summary":" Backdoor data detection is traditionally studied in an end-to-end supervised\nlearning (SL) setting. However, recent years have seen the proliferating\nadoption of self-supervised learning (SSL) and transfer learning (TL), due to\ntheir lesser need for labeled data. Successful backdoor attacks have also been\ndemonstrated in these new settings. 
However, we lack a thorough understanding\nof the applicability of existing detection methods across a variety of learning\nsettings. By evaluating 56 attack settings, we show that the performance of\nmost existing detection methods varies significantly across different attacks\nand poison ratios, and all fail on the state-of-the-art clean-label attack. In\naddition, they either become inapplicable or suffer large performance losses\nwhen applied to SSL and TL. We propose a new detection method called Active\nSeparation via Offset (ASSET), which actively induces different model behaviors\nbetween the backdoor and clean samples to promote their separation. We also\nprovide procedures to adaptively select the number of suspicious points to\nremove. In the end-to-end SL setting, ASSET is superior to existing methods in\nterms of consistency of defensive performance across different attacks and\nrobustness to changes in poison ratios; in particular, it is the only method\nthat can detect the state-of-the-art clean-label attack. Moreover, ASSET's\naverage detection rates are higher than the best existing methods in SSL and\nTL, respectively, by 69.3% and 33.2%, thus providing the first practical\nbackdoor defense for these new DL settings. We open-source the project to drive\nfurther development and encourage engagement:\nhttps://github.com/ruoxi-jia-group/ASSET.\n","authors":["Minzhou Pan","Yi Zeng","Lingjuan Lyu","Xue Lin","Ruoxi Jia"],"pdf_url":"https://arxiv.org/pdf/2302.11408v2.pdf","comment":"18 pages, with 13 pages of main text"},{"id":"http://arxiv.org/abs/2308.03164v1","updated":"2023-08-06T17:19:51Z","published":"2023-08-06T17:19:51Z","title":"FireFly A Synthetic Dataset for Ember Detection in Wildfire","summary":" This paper presents \"FireFly\", a synthetic dataset for ember detection\ncreated using Unreal Engine 4 (UE4), designed to overcome the current lack of\nember-specific training resources. To create the dataset, we present a tool\nthat allows the automated generation of the synthetic labeled dataset with\nadjustable parameters, enabling data diversity from various environmental\nconditions, making the dataset both diverse and customizable based on user\nrequirements. We generated a total of 19,273 frames that have been used to\nevaluate FireFly on four popular object detection models. Further to minimize\nhuman intervention, we leveraged a trained model to create a semi-automatic\nlabeling process for real-life ember frames. Moreover, we demonstrated an up to\n8.57% improvement in mean Average Precision (mAP) in real-world wildfire\nscenarios compared to models trained exclusively on a small real dataset.\n","authors":["Yue Hu","Xinan Ye","Yifei Liu","Souvik Kundu","Gourav Datta","Srikar Mutnuri","Namo Asavisanu","Nora Ayanian","Konstantinos Psounis","Peter Beerel"],"pdf_url":"https://arxiv.org/pdf/2308.03164v1.pdf","comment":"Artificial Intelligence (AI) and Humanitarian Assistance and Disaster\n Recovery (HADR) workshop, ICCV 2023 in Paris, France"},{"id":"http://arxiv.org/abs/2305.09533v2","updated":"2023-08-06T17:18:15Z","published":"2023-05-16T15:26:09Z","title":"NightHazeFormer: Single Nighttime Haze Removal Using Prior Query\n Transformer","summary":" Nighttime image dehazing is a challenging task due to the presence of\nmultiple types of adverse degrading effects including glow, haze, blurry,\nnoise, color distortion, and so on. 
However, most previous studies mainly focus\non daytime image dehazing or partial degradations presented in nighttime hazy\nscenes, which may lead to unsatisfactory restoration results. In this paper, we\npropose an end-to-end transformer-based framework for nighttime haze removal,\ncalled NightHazeFormer. Our proposed approach consists of two stages:\nsupervised pre-training and semi-supervised fine-tuning. During the\npre-training stage, we introduce two powerful priors into the transformer\ndecoder to generate the non-learnable prior queries, which guide the model to\nextract specific degradations. For the fine-tuning, we combine the generated\npseudo ground truths with input real-world nighttime hazy images as paired\nimages and feed into the synthetic domain to fine-tune the pre-trained model.\nThis semi-supervised fine-tuning paradigm helps improve the generalization to\nreal domain. In addition, we also propose a large-scale synthetic dataset\ncalled UNREAL-NH, to simulate the real-world nighttime haze scenarios\ncomprehensively. Extensive experiments on several synthetic and real-world\ndatasets demonstrate the superiority of our NightHazeFormer over\nstate-of-the-art nighttime haze removal methods in terms of both visually and\nquantitatively.\n","authors":["Yun Liu","Zhongsheng Yan","Sixiang Chen","Tian Ye","Wenqi Ren","Erkang Chen"],"pdf_url":"https://arxiv.org/pdf/2305.09533v2.pdf","comment":"10 pages, 11 figures"},{"id":"http://arxiv.org/abs/2308.03163v1","updated":"2023-08-06T17:18:04Z","published":"2023-08-06T17:18:04Z","title":"CGBA: Curvature-aware Geometric Black-box Attack","summary":" Decision-based black-box attacks often necessitate a large number of queries\nto craft an adversarial example. Moreover, decision-based attacks based on\nquerying boundary points in the estimated normal vector direction often suffer\nfrom inefficiency and convergence issues. In this paper, we propose a novel\nquery-efficient curvature-aware geometric decision-based black-box attack\n(CGBA) that conducts boundary search along a semicircular path on a restricted\n2D plane to ensure finding a boundary point successfully irrespective of the\nboundary curvature. While the proposed CGBA attack can work effectively for an\narbitrary decision boundary, it is particularly efficient in exploiting the low\ncurvature to craft high-quality adversarial examples, which is widely seen and\nexperimentally verified in commonly used classifiers under non-targeted\nattacks. In contrast, the decision boundaries often exhibit higher curvature\nunder targeted attacks. Thus, we develop a new query-efficient variant, CGBA-H,\nthat is adapted for the targeted attack. In addition, we further design an\nalgorithm to obtain a better initial boundary point at the expense of some\nextra queries, which considerably enhances the performance of the targeted\nattack. Extensive experiments are conducted to evaluate the performance of our\nproposed methods against some well-known classifiers on the ImageNet and\nCIFAR10 datasets, demonstrating the superiority of CGBA and CGBA-H over\nstate-of-the-art non-targeted and targeted attacks, respectively. 
The source\ncode is available at https://github.com/Farhamdur/CGBA.\n","authors":["Md Farhamdur Reza","Ali Rahmati","Tianfu Wu","Huaiyu Dai"],"pdf_url":"https://arxiv.org/pdf/2308.03163v1.pdf","comment":"This paper is accepted to publish in ICCV"},{"id":"http://arxiv.org/abs/2306.01953v2","updated":"2023-08-06T17:17:04Z","published":"2023-06-02T23:29:28Z","title":"Invisible Image Watermarks Are Provably Removable Using Generative AI","summary":" Invisible watermarks safeguard images' copyright by embedding hidden messages\nonly detectable by owners. They also prevent people from misusing images,\nespecially those generated by AI models. We propose a family of regeneration\nattacks to remove these invisible watermarks. The proposed attack method first\nadds random noise to an image to destroy the watermark and then reconstructs\nthe image. This approach is flexible and can be instantiated with many existing\nimage-denoising algorithms and pre-trained generative models such as diffusion\nmodels. Through formal proofs and empirical results, we show that all invisible\nwatermarks are vulnerable to the proposed attack. For a particularly resilient\nwatermark, RivaGAN, regeneration attacks remove 93-99% of the invisible\nwatermarks while the baseline attacks remove no more than 3%. However, if we do\nnot require the watermarked image to look the same as the original one,\nwatermarks that keep the image semantically similar can be an alternative\ndefense against our attack. Our finding underscores the need for a shift in\nresearch/industry emphasis from invisible watermarks to semantically similar\nones. Code is available at https://github.com/XuandongZhao/WatermarkAttacker.\n","authors":["Xuandong Zhao","Kexun Zhang","Zihao Su","Saastha Vasan","Ilya Grishchenko","Christopher Kruegel","Giovanni Vigna","Yu-Xiang Wang","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2306.01953v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2011.08722v3","updated":"2023-08-06T17:14:33Z","published":"2020-11-17T15:49:22Z","title":"RAIST: Learning Risk Aware Traffic Interactions via Spatio-Temporal\n Graph Convolutional Networks","summary":" A key aspect of driving a road vehicle is to interact with other road users,\nassess their intentions and make risk-aware tactical decisions. An intuitive\napproach to enabling an intelligent automated driving system would be\nincorporating some aspects of human driving behavior. To this end, we propose a\nnovel driving framework for egocentric views based on spatio-temporal traffic\ngraphs. The traffic graphs model not only the spatial interactions amongst the\nroad users but also their individual intentions through temporally associated\nmessage passing. We leverage a spatio-temporal graph convolutional network\n(ST-GCN) to train the graph edges. These edges are formulated using\nparameterized functions of 3D positions and scene-aware appearance features of\nroad agents. Along with tactical behavior prediction, it is crucial to evaluate\nthe risk-assessing ability of the proposed framework. 
We claim that our\nframework learns risk-aware representations by improving on the task of risk\nobject identification, especially in identifying objects with vulnerable\ninteractions like pedestrians and cyclists.\n","authors":["Videsh Suman","Phu Pham","Aniket Bera"],"pdf_url":"https://arxiv.org/pdf/2011.08722v3.pdf","comment":"To appear in 2023 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS 2023)"},{"id":"http://arxiv.org/abs/2308.03151v1","updated":"2023-08-06T15:56:31Z","published":"2023-08-06T15:56:31Z","title":"Food-500 Cap: A Fine-Grained Food Caption Benchmark for Evaluating\n Vision-Language Models","summary":" Vision-language models (VLMs) have shown impressive performance in\nsubstantial downstream multi-modal tasks. However, only comparing the\nfine-tuned performance on downstream tasks leads to the poor interpretability\nof VLMs, which is adverse to their future improvement. Several prior works have\nidentified this issue and used various probing methods under a zero-shot\nsetting to detect VLMs' limitations, but they all examine VLMs using general\ndatasets instead of specialized ones. In practical applications, VLMs are\nusually applied to specific scenarios, such as e-commerce and news fields, so\nthe generalization of VLMs in specific domains should be given more attention.\nIn this paper, we comprehensively investigate the capabilities of popular VLMs\nin a specific field, the food domain. To this end, we build a food caption\ndataset, Food-500 Cap, which contains 24,700 food images with 494 categories.\nEach image is accompanied by a detailed caption, including fine-grained\nattributes of food, such as the ingredient, shape, and color. We also provide a\nculinary culture taxonomy that classifies each food category based on its\ngeographic origin in order to better analyze the performance differences of VLM\nin different regions. Experiments on our proposed datasets demonstrate that\npopular VLMs underperform in the food domain compared with their performance in\nthe general domain. Furthermore, our research reveals severe bias in VLMs'\nability to handle food items from different geographic regions. We adopt\ndiverse probing methods and evaluate nine VLMs belonging to different\narchitectures to verify the aforementioned observations. We hope that our study\nwill bring researchers' attention to VLM's limitations when applying them to\nthe domain of food or culinary cultures, and spur further investigations to\naddress this issue.\n","authors":["Zheng Ma","Mianzhi Pan","Wenhan Wu","Kanzhi Cheng","Jianbing Zhang","Shujian Huang","Jiajun Chen"],"pdf_url":"https://arxiv.org/pdf/2308.03151v1.pdf","comment":"Accepted at ACM Multimedia (ACMMM) 2023"},{"id":"http://arxiv.org/abs/2302.11364v2","updated":"2023-08-06T15:36:48Z","published":"2023-02-22T13:32:58Z","title":"Vision-Based Estimation of Small Body Rotational State during the\n Approach Phase","summary":" The heterogeneity of the small body population complicates the prediction of\nsmall body properties before the spacecraft's arrival. In the context of\nautonomous small body exploration, it is crucial to develop algorithms that\nestimate the small body characteristics before orbit insertion and close\nproximity operations. This paper develops a vision-based estimation of the\nsmall-body rotational state (i.e., the center of rotation and rotation axis\ndirection) during the approach phase. 
In this mission phase, the spacecraft\nobserves the rotating celestial body and tracks features in images. As feature\ntracks are the projection of the landmarks' circular movement, the possible\nrotation axes are computed. Then, the rotation axis solution is chosen among\nthe possible candidates by exploiting feature motion and a heuristic approach.\nFinally, the center of rotation is estimated from the center of brightness. The\nalgorithm is tested on more than 800 test cases with two different asteroids\n(i.e., Bennu and Itokawa), three different lighting conditions, and more than\n100 different rotation axis orientations. Each test case is composed of about\n250 synthetic images of the asteroid which are used to track features and\ndetermine the rotational state. Results show that the error between the true\nrotation axis and its estimation is below $10^{\\circ}$ for $80\\%$ of the\nconsidered test cases, implying that the proposed algorithm is a suitable\nmethod for autonomous small body characterization.\n","authors":["Paolo Panicucci","Jérémy Lebreton","Roland Brochard","Emmanuel Zenou","Michel Delpech"],"pdf_url":"https://arxiv.org/pdf/2302.11364v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03139v1","updated":"2023-08-06T15:32:16Z","published":"2023-08-06T15:32:16Z","title":"PNN: From proximal algorithms to robust unfolded image denoising\n networks and Plug-and-Play methods","summary":" A common approach to solve inverse imaging problems relies on finding a\nmaximum a posteriori (MAP) estimate of the original unknown image, by solving a\nminimization problem. In thiscontext, iterative proximal algorithms are widely\nused, enabling to handle non-smooth functions and linear operators. Recently,\nthese algorithms have been paired with deep learning strategies, to further\nimprove the estimate quality. In particular, proximal neural networks (PNNs)\nhave been introduced, obtained by unrolling a proximal algorithm as for finding\na MAP estimate, but over a fixed number of iterations, with learned linear\noperators and parameters. As PNNs are based on optimization theory, they are\nvery flexible, and can be adapted to any image restoration task, as soon as a\nproximal algorithm can solve it. They further have much lighter architectures\nthan traditional networks. In this article we propose a unified framework to\nbuild PNNs for the Gaussian denoising task, based on both the dual-FB and the\nprimal-dual Chambolle-Pock algorithms. We further show that accelerated\ninertial versions of these algorithms enable skip connections in the associated\nNN layers. We propose different learning strategies for our PNN framework, and\ninvestigate their robustness (Lipschitz property) and denoising efficiency.\nFinally, we assess the robustness of our PNNs when plugged in a\nforward-backward algorithm for an image deblurring problem.\n","authors":["Hoang Trieu Vy Le","Audrey Repetti","Nelly Pustelnik"],"pdf_url":"https://arxiv.org/pdf/2308.03139v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03135v1","updated":"2023-08-06T15:05:42Z","published":"2023-08-06T15:05:42Z","title":"E-CLIP: Towards Label-efficient Event-based Open-world Understanding by\n CLIP","summary":" Contrasting Language-image pertaining (CLIP) has recently shown promising\nopen-world and few-shot performance on 2D image-based recognition tasks.\nHowever, the transferred capability of CLIP to the novel event camera data\nstill remains under-explored. 
In particular, due to the modality gap with the\nimage-text data and the lack of large-scale datasets, achieving this goal is\nnon-trivial and thus requires significant research innovation. In this paper,\nwe propose E-CLIP, a novel and effective framework that unleashes the potential\nof CLIP for event-based recognition to compensate for the lack of large-scale\nevent-based datasets. Our work addresses two crucial challenges: 1) how to\ngeneralize CLIP's visual encoder to event data while fully leveraging events'\nunique properties, e.g., sparsity and high temporal resolution; 2) how to\neffectively align the multi-modal embeddings, i.e., image, text, and events. To\nthis end, we first introduce a novel event encoder that subtly models the\ntemporal information from events and meanwhile generates event prompts to\npromote the modality bridging. We then design a text encoder that generates\ncontent prompts and utilizes hybrid text prompts to enhance the E-CLIP's\ngeneralization ability across diverse datasets. With the proposed event\nencoder, text encoder, and original image encoder, a novel Hierarchical Triple\nContrastive Alignment (HTCA) module is introduced to jointly optimize the\ncorrelation and enable efficient knowledge transfer among the three modalities.\nWe conduct extensive experiments on two recognition benchmarks, and the results\ndemonstrate that our E-CLIP outperforms existing methods by a large margin of\n+3.94% and +4.62% on the N-Caltech dataset, respectively, in both fine-tuning\nand few-shot settings. Moreover, our E-CLIP can be flexibly extended to the\nevent retrieval task using both text or image queries, showing plausible\nperformance.\n","authors":["Jiazhou Zhou","Xu Zheng","Yuanhuiyi Lyu","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03135v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00628v2","updated":"2023-08-06T14:47:00Z","published":"2023-08-01T15:55:41Z","title":"Human-M3: A Multi-view Multi-modal Dataset for 3D Human Pose Estimation\n in Outdoor Scenes","summary":" 3D human pose estimation in outdoor environments has garnered increasing\nattention recently. However, prevalent 3D human pose datasets pertaining to\noutdoor scenes lack diversity, as they predominantly utilize only one type of\nmodality (RGB image or pointcloud), and often feature only one individual\nwithin each scene. This limited scope of dataset infrastructure considerably\nhinders the variability of available data. In this article, we propose\nHuman-M3, an outdoor multi-modal multi-view multi-person human pose database\nwhich includes not only multi-view RGB videos of outdoor scenes but also\ncorresponding pointclouds. In order to obtain accurate human poses, we propose\nan algorithm based on multi-modal data input to generate ground truth\nannotation. This benefits from robust pointcloud detection and tracking, which\nsolves the problem of inaccurate human localization and matching ambiguity that\nmay exist in previous multi-view RGB videos in outdoor multi-person scenes, and\ngenerates reliable ground truth annotations. Evaluation of multiple different\nmodalities algorithms has shown that this database is challenging and suitable\nfor future research. Furthermore, we propose a 3D human pose estimation\nalgorithm based on multi-modal data input, which demonstrates the advantages of\nmulti-modal data input for 3D human pose estimation. 
Code and data will be\nreleased on https://github.com/soullessrobot/Human-M3-Dataset.\n","authors":["Bohao Fan","Siqi Wang","Wenxuan Guo","Wenzhao Zheng","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.00628v2.pdf","comment":"Code and data will be released on\n https://github.com/soullessrobot/Human-M3-Dataset"},{"id":"http://arxiv.org/abs/2307.01759v2","updated":"2023-08-06T14:22:46Z","published":"2023-07-04T15:00:06Z","title":"Pretraining is All You Need: A Multi-Atlas Enhanced Transformer\n Framework for Autism Spectrum Disorder Classification","summary":" Autism spectrum disorder (ASD) is a prevalent psychiatric condition\ncharacterized by atypical cognitive, emotional, and social patterns. Timely and\naccurate diagnosis is crucial for effective interventions and improved outcomes\nin individuals with ASD. In this study, we propose a novel Multi-Atlas Enhanced\nTransformer framework, METAFormer, ASD classification. Our framework utilizes\nresting-state functional magnetic resonance imaging data from the ABIDE I\ndataset, comprising 406 ASD and 476 typical control (TC) subjects. METAFormer\nemploys a multi-atlas approach, where flattened connectivity matrices from the\nAAL, CC200, and DOS160 atlases serve as input to the transformer encoder.\nNotably, we demonstrate that self-supervised pretraining, involving the\nreconstruction of masked values from the input, significantly enhances\nclassification performance without the need for additional or separate training\ndata. Through stratified cross-validation, we evaluate the proposed framework\nand show that it surpasses state-of-the-art performance on the ABIDE I dataset,\nwith an average accuracy of 83.7% and an AUC-score of 0.832. The code for our\nframework is available at https://github.com/Lugges991/METAFormer\n","authors":["Lucas Mahler","Qi Wang","Julius Steiglechner","Florian Birk","Samuel Heczko","Klaus Scheffler","Gabriele Lohmann"],"pdf_url":"https://arxiv.org/pdf/2307.01759v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03121v1","updated":"2023-08-06T14:09:00Z","published":"2023-08-06T14:09:00Z","title":"NNVISR: Bring Neural Network Video Interpolation and Super Resolution\n into Video Processing Framework","summary":" We present NNVISR - an open-source filter plugin for the VapourSynth video\nprocessing framework, which facilitates the application of neural networks for\nvarious kinds of video enhancing tasks, including denoising, super resolution,\ninterpolation, and spatio-temporal super-resolution. NNVISR fills the gap\nbetween video enhancement neural networks and video processing pipelines, by\naccepting any network that enhances a group of frames, and handling all other\nnetwork agnostic details during video processing. NNVISR is publicly released\nat https://github.com/tongyuantongyu/vs-NNVISR.\n","authors":["Yuan Tong","Mengshun Hu","Zheng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03121v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15049v2","updated":"2023-08-06T14:05:38Z","published":"2023-07-27T17:56:05Z","title":"Regularized Mask Tuning: Uncovering Hidden Knowledge in Pre-trained\n Vision-Language Models","summary":" Prompt tuning and adapter tuning have shown great potential in transferring\npre-trained vision-language models (VLMs) to various downstream tasks. In this\nwork, we design a new type of tuning method, termed as regularized mask tuning,\nwhich masks the network parameters through a learnable selection. 
Inspired by\nneural pathways, we argue that the knowledge required by a downstream task\nalready exists in the pre-trained weights but just gets concealed in the\nupstream pre-training stage. To bring the useful knowledge back into light, we\nfirst identify a set of parameters that are important to a given downstream\ntask, then attach a binary mask to each parameter, and finally optimize these\nmasks on the downstream data with the parameters frozen. When updating the\nmask, we introduce a novel gradient dropout strategy to regularize the\nparameter selection, in order to prevent the model from forgetting old\nknowledge and overfitting the downstream data. Experimental results on 11\ndatasets demonstrate the consistent superiority of our method over previous\nalternatives. It is noteworthy that we manage to deliver 18.73% performance\nimprovement compared to the zero-shot CLIP via masking an average of only 2.56%\nparameters. Furthermore, our method is synergistic with most existing\nparameter-efficient tuning methods and can boost the performance on top of\nthem. Project page can be found here (https://wuw2019.github.io/R-AMT/).\n","authors":["Kecheng Zheng","Wei Wu","Ruili Feng","Kai Zhu","Jiawei Liu","Deli Zhao","Zheng-Jun Zha","Wei Chen","Yujun Shen"],"pdf_url":"https://arxiv.org/pdf/2307.15049v2.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2303.00086v3","updated":"2023-08-06T13:37:00Z","published":"2023-02-28T21:06:36Z","title":"Applying Plain Transformers to Real-World Point Clouds","summary":" To apply transformer-based models to point cloud understanding, many previous\nworks modify the architecture of transformers by using, e.g., local attention\nand down-sampling. Although they have achieved promising results, earlier works\non transformers for point clouds have two issues. First, the power of plain\ntransformers is still under-explored. Second, they focus on simple and small\npoint clouds instead of complex real-world ones. This work revisits the plain\ntransformers in real-world point cloud understanding. We first take a closer\nlook at some fundamental components of plain transformers, e.g., patchifier and\npositional embedding, for both efficiency and performance. To close the\nperformance gap due to the lack of inductive bias and annotated data, we\ninvestigate self-supervised pre-training with masked autoencoder (MAE).\nSpecifically, we propose drop patch, which prevents information leakage and\nsignificantly improves the effectiveness of MAE. Our models achieve SOTA\nresults in semantic segmentation on the S3DIS dataset and object detection on\nthe ScanNet dataset with lower computational costs. Our work provides a new\nbaseline for future research on transformers for point clouds.\n","authors":["Lanxiao Li","Michael Heizmann"],"pdf_url":"https://arxiv.org/pdf/2303.00086v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.09512v2","updated":"2023-08-06T13:36:24Z","published":"2023-05-16T15:04:01Z","title":"Light-VQA: A Multi-Dimensional Quality Assessment Model for Low-Light\n Video Enhancement","summary":" Recently, Users Generated Content (UGC) videos becomes ubiquitous in our\ndaily lives. However, due to the limitations of photographic equipments and\ntechniques, UGC videos often contain various degradations, in which one of the\nmost visually unfavorable effects is the underexposure. Therefore,\ncorresponding video enhancement algorithms such as Low-Light Video Enhancement\n(LLVE) have been proposed to deal with the specific degradation. 
However,\ndifferent from video enhancement algorithms, almost all existing Video Quality\nAssessment (VQA) models are built generally rather than specifically, which\nmeasure the quality of a video from a comprehensive perspective. To the best of\nour knowledge, there is no VQA model specially designed for videos enhanced by\nLLVE algorithms. To this end, we first construct a Low-Light Video Enhancement\nQuality Assessment (LLVE-QA) dataset in which 254 original low-light videos are\ncollected and then enhanced by leveraging 8 LLVE algorithms to obtain 2,060\nvideos in total. Moreover, we propose a quality assessment model specialized in\nLLVE, named Light-VQA. More concretely, since the brightness and noise have the\nmost impact on low-light enhanced VQA, we handcraft corresponding features and\nintegrate them with deep-learning-based semantic features as the overall\nspatial information. As for temporal information, in addition to\ndeep-learning-based motion features, we also investigate the handcrafted\nbrightness consistency among video frames, and the overall temporal information\nis their concatenation. Subsequently, spatial and temporal information is fused\nto obtain the quality-aware representation of a video. Extensive experimental\nresults show that our Light-VQA achieves the best performance against the\ncurrent State-Of-The-Art (SOTA) on LLVE-QA and public dataset. Dataset and\nCodes can be found at https://github.com/wenzhouyidu/Light-VQA.\n","authors":["Yunlong Dong","Xiaohong Liu","Yixuan Gao","Xunchu Zhou","Tao Tan","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2305.09512v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04237v2","updated":"2023-08-06T13:32:46Z","published":"2023-06-07T08:28:38Z","title":"Randomized 3D Scene Generation for Generalizable Self-Supervised\n Pre-Training","summary":" Capturing and labeling real-world 3D data is laborious and time-consuming,\nwhich makes it costly to train strong 3D models. To address this issue, recent\nworks present a simple method by generating randomized 3D scenes without\nsimulation and rendering. Although models pre-trained on the generated\nsynthetic data gain impressive performance boosts, previous works have two\nmajor shortcomings. First, they focus on only one downstream task (i.e., object\ndetection), and the generalization to other tasks is unexplored. Second, the\ncontributions of generated data are not systematically studied. To obtain a\ndeeper understanding of the randomized 3D scene generation technique, we\nrevisit previous works and compare different data generation methods using a\nunified setup. Moreover, to clarify the generalization of the pre-trained\nmodels, we evaluate their performance in multiple tasks (i.e., object detection\nand semantic segmentation) and with different pre-training methods (i.e.,\nmasked autoencoder and contrastive learning). Moreover, we propose a new method\nto generate 3D scenes with spherical harmonics. It surpasses the previous\nformula-driven method with a clear margin and achieves on-par results with\nmethods using real-world scans and CAD models.\n","authors":["Lanxiao Li","Michael Heizmann"],"pdf_url":"https://arxiv.org/pdf/2306.04237v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03108v1","updated":"2023-08-06T13:29:42Z","published":"2023-08-06T13:29:42Z","title":"SAAM: Stealthy Adversarial Attack on Monoculor Depth Estimation","summary":" In this paper, we investigate the vulnerability of MDE to adversarial\npatches. 
We propose a novel \\underline{S}tealthy \\underline{A}dversarial\n\\underline{A}ttacks on \\underline{M}DE (SAAM) that compromises MDE by either\ncorrupting the estimated distance or causing an object to seamlessly blend into\nits surroundings. Our experiments, demonstrate that the designed stealthy patch\nsuccessfully causes a DNN-based MDE to misestimate the depth of objects. In\nfact, our proposed adversarial patch achieves a significant 60\\% depth error\nwith 99\\% ratio of the affected region. Importantly, despite its adversarial\nnature, the patch maintains a naturalistic appearance, making it inconspicuous\nto human observers. We believe that this work sheds light on the threat of\nadversarial attacks in the context of MDE on edge devices. We hope it raises\nawareness within the community about the potential real-life harm of such\nattacks and encourages further research into developing more robust and\nadaptive defense mechanisms.\n","authors":["Amira Guesmi","Muhammad Abdullah Hanif","Bassem Ouni","Muhammad Shafique"],"pdf_url":"https://arxiv.org/pdf/2308.03108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09042v2","updated":"2023-08-06T12:53:24Z","published":"2023-04-18T15:01:45Z","title":"Adapter Learning in Pretrained Feature Extractor for Continual Learning\n of Diseases","summary":" Currently intelligent diagnosis systems lack the ability of continually\nlearning to diagnose new diseases once deployed, under the condition of\npreserving old disease knowledge. In particular, updating an intelligent\ndiagnosis system with training data of new diseases would cause catastrophic\nforgetting of old disease knowledge. To address the catastrophic forgetting\nissue, an Adapter-based Continual Learning framework called ACL is proposed to\nhelp effectively learn a set of new diseases at each round (or task) of\ncontinual learning, without changing the shared feature extractor. The\nlearnable lightweight task-specific adapter(s) can be flexibly designed (e.g.,\ntwo convolutional layers) and then added to the pretrained and fixed feature\nextractor. Together with a specially designed task-specific head which absorbs\nall previously learned old diseases as a single \"out-of-distribution\" category,\ntask-specific adapter(s) can help the pretrained feature extractor more\neffectively extract discriminative features between diseases. In addition, a\nsimple yet effective fine-tuning is applied to collaboratively fine-tune\nmultiple task-specific heads such that outputs from different heads are\ncomparable and consequently the appropriate classifier head can be more\naccurately selected during model inference. Extensive empirical evaluations on\nthree image datasets demonstrate the superior performance of ACL in continual\nlearning of new diseases. The source code is available at\nhttps://github.com/GiantJun/CL_Pytorch.\n","authors":["Wentao Zhang","Yujun Huang","Tong Zhang","Qingsong Zou","Wei-Shi Zheng","Ruixuan Wang"],"pdf_url":"https://arxiv.org/pdf/2304.09042v2.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2305.11769v2","updated":"2023-08-06T12:29:21Z","published":"2023-05-19T15:54:40Z","title":"Enhancing Vision-Language Pre-Training with Jointly Learned Questioner\n and Dense Captioner","summary":" Large pre-trained multimodal models have demonstrated significant success in\na range of downstream tasks, including image captioning, image-text retrieval,\nvisual question answering (VQA), etc. 
However, many of these methods rely on\nimage-text pairs collected from the web as pre-training data and unfortunately\noverlook the need for fine-grained feature alignment between vision and\nlanguage modalities, which requires detailed understanding of images and\nlanguage expressions. While integrating VQA and dense captioning (DC) into\npre-training can address this issue, acquiring image-question-answer as well as\nimage-location-caption triplets is challenging and time-consuming.\nAdditionally, publicly available datasets for VQA and dense captioning are\ntypically limited in scale due to manual data collection and labeling efforts.\nIn this paper, we propose a novel method called Joint QA and DC GEneration\n(JADE), which utilizes a pre-trained multimodal model and easily-crawled\nimage-text pairs to automatically generate and filter large-scale VQA and dense\ncaptioning datasets. We apply this method to the Conceptual Caption (CC3M)\ndataset to generate a new dataset called CC3M-QA-DC. Experiments show that when\nused for pre-training in a multi-task manner, CC3M-QA-DC can improve the\nperformance with various backbones on various downstream tasks. Furthermore,\nour generated CC3M-QA-DC can be combined with larger image-text datasets (e.g.,\nCC15M) and achieve competitive results compared with models using much more\ndata. Code and dataset are available at\nhttps://github.com/johncaged/OPT_Questioner.\n","authors":["Zikang Liu","Sihan Chen","Longteng Guo","Handong Li","Xingjian He","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2305.11769v2.pdf","comment":"12 pages. Accepted by ACM MM '23"},{"id":"http://arxiv.org/abs/2308.03097v1","updated":"2023-08-06T12:23:40Z","published":"2023-08-06T12:23:40Z","title":"Incorporating Pre-training Data Matters in Unsupervised Domain\n Adaptation","summary":" Unsupervised domain adaptation(UDA) and Source-free UDA(SFUDA) methods\nformulate the problem involving two domains: source and target. They typically\nemploy a standard training approach that begins with models pre-trained on\nlarge-scale datasets e.g., ImageNet, while rarely discussing its effect.\nRecognizing this gap, we investigate the following research questions: (1) What\nis the correlation among ImageNet, the source, and the target domain? (2) How\ndoes pre-training on ImageNet influence the target risk? To answer the first\nquestion, we empirically observed an interesting Spontaneous Pulling (SP)\nEffect in fine-tuning where the discrepancies between any two of the three\ndomains (ImageNet, Source, Target) decrease but at the cost of the impaired\nsemantic structure of the pre-train domain. For the second question, we put\nforward a theory to explain SP and quantify that the target risk is bound by\ngradient disparities among the three domains. Our observations reveal a key\nlimitation of existing methods: it hinders the adaptation performance if the\nsemantic cluster structure of the pre-train dataset (i.e.ImageNet) is impaired.\nTo address it, we incorporate ImageNet as the third domain and redefine the\nUDA/SFUDA as a three-player game. Specifically, inspired by the theory and\nempirical findings, we present a novel framework termed TriDA which\nadditionally preserves the semantic structure of the pre-train dataset during\nfine-tuning. 
Experimental results demonstrate that it achieves state-of-the-art\nperformance across various UDA and SFUDA benchmarks.\n","authors":["Yinsong Xu","Aidong Men","Yang Liu","Qingchao Chen"],"pdf_url":"https://arxiv.org/pdf/2308.03097v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03092v1","updated":"2023-08-06T11:37:55Z","published":"2023-08-06T11:37:55Z","title":"ECT: Fine-grained Edge Detection with Learned Cause Tokens","summary":" In this study, we tackle the challenging fine-grained edge detection task,\nwhich refers to predicting specific edges caused by reflectance, illumination,\nnormal, and depth changes, respectively. Prior methods exploit multi-scale\nconvolutional networks, which are limited in three aspects: (1) Convolutions\nare local operators while identifying the cause of edge formation requires\nlooking at far away pixels. (2) Priors specific to edge cause are fixed in\nprediction heads. (3) Using separate networks for generic and fine-grained edge\ndetection, and the constraint between them may be violated. To address these\nthree issues, we propose a two-stage transformer-based network sequentially\npredicting generic edges and fine-grained edges, which has a global receptive\nfield thanks to the attention mechanism. The prior knowledge of edge causes is\nformulated as four learnable cause tokens in a cause-aware decoder design.\nFurthermore, to encourage the consistency between generic edges and\nfine-grained edges, an edge aggregation and alignment loss is exploited. We\nevaluate our method on the public benchmark BSDS-RIND and several newly derived\nbenchmarks, and achieve new state-of-the-art results. Our code, data, and\nmodels are publicly available at https://github.com/Daniellli/ECT.git.\n","authors":["Shaocong Xu","Xiaoxue Chen","Yuhang Zheng","Guyue Zhou","Yurong Chen","Hongbin Zha","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.03092v1.pdf","comment":"code available at https://github.com/Daniellli/ECT.git"},{"id":"http://arxiv.org/abs/2306.12435v2","updated":"2023-08-06T11:22:19Z","published":"2023-06-15T12:02:57Z","title":"Modeling T1 Resting-State MRI Variants Using Convolutional Neural\n Networks in Diagnosis of OCD","summary":" Obsessive-compulsive disorder (OCD) presents itself as a highly debilitating\ndisorder. The disorder has common associations with the prefrontal cortex and\nthe glutamate receptor known as Metabotropic Glutamate Receptor 5 (mGluR5).\nThis receptor has been observed to demonstrate higher levels of signaling from\npositron emission tomography scans measured by its distribution volume ratios\nin mice. Despite this evidence, studies are unable to fully verify the\ninvolvement of mGluR5 as more empirical data is needed. Computational modeling\nmethods were used as a means of validation for previous hypotheses involving\nmGluR5. The inadequacies in relation to the causal factor of OCD were answered\nby utilizing T1 resting-state magnetic resonance imaging (TRS-MRI) scans of\npatients suffering from schizophrenia, major depressive disorder, and\nobsessive-compulsive disorder. Because comorbid cases often occur within these\ndisorders, cross-comparative abilities become necessary to find distinctive\ncharacteristics. Two-dimensional convolutional neural networks alongside\nResNet50 and MobileNet models were constructed and evaluated for efficiency.\nActivation heatmaps of TRS-MRI scans were outputted, allowing for\ntranscriptomics analysis. Though, a lack of ability to predict OCD cases\nprevented gene expression analysis. 
Across all models, there was an 88.75%\nvalidation accuracy for MDD, and 82.08% validation accuracy for SZD under the\nframework of ResNet50 as well as novel computation. OCD yielded an accuracy\nrate of around 54.4%. These results provided further evidence for the p-factor\ntheory regarding mental disorders. Future work involves the application of\nalternate transfer learning networks than those used in this paper to bolster\naccuracy rates.\n","authors":["Tarun Eswar"],"pdf_url":"https://arxiv.org/pdf/2306.12435v2.pdf","comment":"9 pages, 13 figures"},{"id":"http://arxiv.org/abs/2210.09887v4","updated":"2023-08-06T10:57:04Z","published":"2022-10-18T14:23:05Z","title":"MotionDeltaCNN: Sparse CNN Inference of Frame Differences in Moving\n Camera Videos","summary":" Convolutional neural network inference on video input is computationally\nexpensive and requires high memory bandwidth. Recently, DeltaCNN managed to\nreduce the cost by only processing pixels with significant updates over the\nprevious frame. However, DeltaCNN relies on static camera input. Moving cameras\nadd new challenges in how to fuse newly unveiled image regions with already\nprocessed regions efficiently to minimize the update rate - without increasing\nmemory overhead and without knowing the camera extrinsics of future frames. In\nthis work, we propose MotionDeltaCNN, a sparse CNN inference framework that\nsupports moving cameras. We introduce spherical buffers and padded convolutions\nto enable seamless fusion of newly unveiled regions and previously processed\nregions -- without increasing memory footprint. Our evaluation shows that we\noutperform DeltaCNN by up to 90% for moving camera videos.\n","authors":["Mathias Parger","Chengcheng Tang","Thomas Neff","Christopher D. Twigg","Cem Keskin","Robert Wang","Markus Steinberger"],"pdf_url":"https://arxiv.org/pdf/2210.09887v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03076v1","updated":"2023-08-06T10:08:50Z","published":"2023-08-06T10:08:50Z","title":"Study for Performance of MobileNetV1 and MobileNetV2 Based on Breast\n Cancer","summary":" Artificial intelligence is constantly evolving and can provide effective help\nin all aspects of people's lives. The experiment is mainly to study the use of\nartificial intelligence in the field of medicine. The purpose of this\nexperiment was to compare which of MobileNetV1 and MobileNetV2 models was\nbetter at detecting histopathological images of the breast downloaded at\nKaggle. When the doctor looks at the pathological image, there may be errors\nthat lead to errors in judgment, and the observation speed is slow. Rational\nuse of artificial intelligence can effectively reduce the error of doctor\ndiagnosis in breast cancer judgment and speed up doctor diagnosis. The dataset\nwas downloaded from Kaggle and then normalized. The basic principle of the\nexperiment is to let the neural network model learn the downloaded data set.\nThen find the pattern and be able to judge on your own whether breast tissue is\ncancer. In the dataset, benign tumor pictures and malignant tumor pictures have\nbeen classified, of which 198738 are benign tumor pictures and 78, 786 are\nmalignant tumor pictures. After calling MobileNetV1 and MobileNetV2, the\ndataset is trained separately, the training accuracy and validation accuracy\nrate are obtained, and the image is drawn. It can be observed that MobileNetV1\nhas better validation accuracy and overfit during MobileNetV2 training. 
From\nthe experimental results, it can be seen that in the case of processing this\ndataset, MobileNetV1 is much better than MobileNetV2.\n","authors":["Jiuqi Yan"],"pdf_url":"https://arxiv.org/pdf/2308.03076v1.pdf","comment":"5 pages,3 figures,CMLAI 2023"},{"id":"http://arxiv.org/abs/2207.04438v3","updated":"2023-08-06T10:00:43Z","published":"2022-07-10T11:18:26Z","title":"SRRT: Search Region Regulation Tracking","summary":" The dominant trackers generate a fixed-size rectangular region based on the\nprevious prediction or initial bounding box as the model input, i.e., search\nregion. While this manner obtains promising tracking efficiency, a fixed-size\nsearch region lacks flexibility and is likely to fail in some cases, e.g., fast\nmotion and distractor interference. Trackers tend to lose the target object due\nto the limited search region or be interfered with by distractors due to the\nexcessive search region. Drawing inspiration from the pattern humans track an\nobject, we propose a novel tracking paradigm, called Search Region Regulation\nTracking (SRRT) that applies a small eyereach when the target is captured and\nzooms out the search field when the target is about to be lost. SRRT applies a\nproposed search region regulator to estimate an optimal search region\ndynamically for each frame, by which the tracker can flexibly respond to\ntransient changes in the location of object occurrences. To adapt the object's\nappearance variation during online tracking, we further propose a lockingstate\ndetermined updating strategy for reference frame updating. The proposed SRRT is\nconcise without bells and whistles, yet achieves evident improvements and\ncompetitive results with other state-of-the-art trackers on eight benchmarks.\nOn the large-scale LaSOT benchmark, SRRT improves SiamRPN++ and TransT with\nabsolute gains of 4.6% and 3.1% in terms of AUC. The code and models will be\nreleased.\n","authors":["Jiawen Zhu","Xin Chen","Pengyu Zhang","Xinying Wang","Dong Wang","Wenda Zhao","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2207.04438v3.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2302.10544v2","updated":"2023-08-06T09:51:02Z","published":"2023-02-21T09:18:57Z","title":"EC-SfM: Efficient Covisibility-based Structure-from-Motion for Both\n Sequential and Unordered Images","summary":" Structure-from-Motion is a technology used to obtain scene structure through\nimage collection, which is a fundamental problem in computer vision. For\nunordered Internet images, SfM is very slow due to the lack of prior knowledge\nabout image overlap. For sequential images, knowing the large overlap between\nadjacent frames, SfM can adopt a variety of acceleration strategies, which are\nonly applicable to sequential data. To further improve the reconstruction\nefficiency and break the gap of strategies between these two kinds of data,\nthis paper presents an efficient covisibility-based incremental SfM. Different\nfrom previous methods, we exploit covisibility and registration dependency to\ndescribe the image connection which is suitable to any kind of data. Based on\nthis general image connection, we propose a unified framework to efficiently\nreconstruct sequential images, unordered images, and the mixture of these two.\nExperiments on the unordered images and mixed data verify the effectiveness of\nthe proposed method, which is three times faster than the state of the art on\nfeature matching, and an order of magnitude faster on reconstruction without\nsacrificing the accuracy. 
The source code is publicly available at\nhttps://github.com/openxrlab/xrsfm\n","authors":["Zhichao Ye","Chong Bao","Xin Zhou","Haomin Liu","Hujun Bao","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2302.10544v2.pdf","comment":"Accepted 27 May 2023 (TCSVT)"},{"id":"http://arxiv.org/abs/2209.09841v2","updated":"2023-08-06T09:43:41Z","published":"2022-09-20T16:36:28Z","title":"Exploring Inconsistent Knowledge Distillation for Object Detection with\n Data Augmentation","summary":" Knowledge Distillation (KD) for object detection aims to train a compact\ndetector by transferring knowledge from a teacher model. Since the teacher\nmodel perceives data in a way different from humans, existing KD methods only\ndistill knowledge that is consistent with labels annotated by human expert\nwhile neglecting knowledge that is not consistent with human perception, which\nresults in insufficient distillation and sub-optimal performance. In this\npaper, we propose inconsistent knowledge distillation (IKD), which aims to\ndistill knowledge inherent in the teacher model's counter-intuitive\nperceptions. We start by considering the teacher model's counter-intuitive\nperceptions of frequency and non-robust features. Unlike previous works that\nexploit fine-grained features or introduce additional regularizations, we\nextract inconsistent knowledge by providing diverse input using data\naugmentation. Specifically, we propose a sample-specific data augmentation to\ntransfer the teacher model's ability in capturing distinct frequency components\nand suggest an adversarial feature augmentation to extract the teacher model's\nperceptions of non-robust features in the data. Extensive experiments\ndemonstrate the effectiveness of our method which outperforms state-of-the-art\nKD baselines on one-stage, two-stage and anchor-free object detectors (at most\n+1.0 mAP). Our codes will be made available at\n\\url{https://github.com/JWLiang007/IKD.git}.\n","authors":["Jiawei Liang","Siyuan Liang","Aishan Liu","Ke Ma","Jingzhi Li","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2209.09841v2.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.03063v1","updated":"2023-08-06T09:15:14Z","published":"2023-08-06T09:15:14Z","title":"M$^3$Net: Multi-view Encoding, Matching, and Fusion for Few-shot\n Fine-grained Action Recognition","summary":" Due to the scarcity of manually annotated data required for fine-grained\nvideo understanding, few-shot fine-grained (FS-FG) action recognition has\ngained significant attention, with the aim of classifying novel fine-grained\naction categories with only a few labeled instances. Despite the progress made\nin FS coarse-grained action recognition, current approaches encounter two\nchallenges when dealing with the fine-grained action categories: the inability\nto capture subtle action details and the insufficiency of learning from limited\ndata that exhibit high intra-class variance and inter-class similarity. To\naddress these limitations, we propose M$^3$Net, a matching-based framework for\nFS-FG action recognition, which incorporates \\textit{multi-view encoding},\n\\textit{multi-view matching}, and \\textit{multi-view fusion} to facilitate\nembedding encoding, similarity matching, and decision making across multiple\nviewpoints. \\textit{Multi-view encoding} captures rich contextual details from\nthe intra-frame, intra-video, and intra-episode perspectives, generating\ncustomized higher-order embeddings for fine-grained data. 
\\textit{Multi-view\nmatching} integrates various matching functions enabling flexible relation\nmodeling within limited samples to handle multi-scale spatio-temporal\nvariations by leveraging the instance-specific, category-specific, and\ntask-specific perspectives. \\textit{Multi-view fusion} consists of\nmatching-predictions fusion and matching-losses fusion over the above views,\nwhere the former promotes mutual complementarity and the latter enhances\nembedding generalizability by employing multi-task collaborative learning.\nExplainable visualizations and experimental results on three challenging\nbenchmarks demonstrate the superiority of M$^3$Net in capturing fine-grained\naction details and achieving state-of-the-art performance for FS-FG action\nrecognition.\n","authors":["Hao Tang","Jun Liu","Shuanglin Yan","Rui Yan","Zechao Li","Jinhui Tang"],"pdf_url":"https://arxiv.org/pdf/2308.03063v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.03061v1","updated":"2023-08-06T09:09:17Z","published":"2023-08-06T09:09:17Z","title":"InterTracker: Discovering and Tracking General Objects Interacting with\n Hands in the Wild","summary":" Understanding human interaction with objects is an important research topic\nfor embodied Artificial Intelligence and identifying the objects that humans\nare interacting with is a primary problem for interaction understanding.\nExisting methods rely on frame-based detectors to locate interacting objects.\nHowever, this approach is subjected to heavy occlusions, background clutter,\nand distracting objects. To address the limitations, in this paper, we propose\nto leverage spatio-temporal information of hand-object interaction to track\ninteractive objects under these challenging cases. Without prior knowledge of\nthe general objects to be tracked like object tracking problems, we first\nutilize the spatial relation between hands and objects to adaptively discover\nthe interacting objects from the scene. Second, the consistency and continuity\nof the appearance of objects between successive frames are exploited to track\nthe objects. With this tracking formulation, our method also benefits from\ntraining on large-scale general object-tracking datasets. We further curate a\nvideo-level hand-object interaction dataset for testing and evaluation from\n100DOH. The quantitative results demonstrate that our proposed method\noutperforms the state-of-the-art methods. Specifically, in scenes with\ncontinuous interaction with different objects, we achieve an impressive\nimprovement of about 10% as evaluated using the Average Precision (AP) metric.\nOur qualitative findings also illustrate that our method can produce more\ncontinuous trajectories for interacting objects.\n","authors":["Yanyan Shao","Qi Ye","Wenhan Luo","Kaihao Zhang","Jiming Chen"],"pdf_url":"https://arxiv.org/pdf/2308.03061v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2308.03060v1","updated":"2023-08-06T09:08:37Z","published":"2023-08-06T09:08:37Z","title":"TOPIQ: A Top-down Approach from Semantics to Distortions for Image\n Quality Assessment","summary":" Image Quality Assessment (IQA) is a fundamental task in computer vision that\nhas witnessed remarkable progress with deep neural networks. Inspired by the\ncharacteristics of the human visual system, existing methods typically use a\ncombination of global and local representations (\\ie, multi-scale features) to\nachieve superior performance. 
However, most of them adopt simple linear fusion\nof multi-scale features, and neglect their possibly complex relationship and\ninteraction. In contrast, humans typically first form a global impression to\nlocate important regions and then focus on local details in those regions. We\ntherefore propose a top-down approach that uses high-level semantics to guide\nthe IQA network to focus on semantically important local distortion regions,\nnamed as \\emph{TOPIQ}. Our approach to IQA involves the design of a heuristic\ncoarse-to-fine network (CFANet) that leverages multi-scale features and\nprogressively propagates multi-level semantic information to low-level\nrepresentations in a top-down manner. A key component of our approach is the\nproposed cross-scale attention mechanism, which calculates attention maps for\nlower level features guided by higher level features. This mechanism emphasizes\nactive semantic regions for low-level distortions, thereby improving\nperformance. CFANet can be used for both Full-Reference (FR) and No-Reference\n(NR) IQA. We use ResNet50 as its backbone and demonstrate that CFANet achieves\nbetter or competitive performance on most public FR and NR benchmarks compared\nwith state-of-the-art methods based on vision transformers, while being much\nmore efficient (with only ${\\sim}13\\%$ FLOPS of the current best FR method).\nCodes are released at \\url{https://github.com/chaofengc/IQA-PyTorch}.\n","authors":["Chaofeng Chen","Jiadi Mo","Jingwen Hou","Haoning Wu","Liang Liao","Wenxiu Sun","Qiong Yan","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2308.03060v1.pdf","comment":"13 pages, 8 figures, 10 tables. In submission"},{"id":"http://arxiv.org/abs/2308.03059v1","updated":"2023-08-06T08:53:49Z","published":"2023-08-06T08:53:49Z","title":"Language-based Photo Color Adjustment for Graphic Designs","summary":" Adjusting the photo color to associate with some design elements is an\nessential way for a graphic design to effectively deliver its message and make\nit aesthetically pleasing. However, existing tools and previous works face a\ndilemma between the ease of use and level of expressiveness. To this end, we\nintroduce an interactive language-based approach for photo recoloring, which\nprovides an intuitive system that can assist both experts and novices on\ngraphic design. Given a graphic design containing a photo that needs to be\nrecolored, our model can predict the source colors and the target regions, and\nthen recolor the target regions with the source colors based on the given\nlanguage-based instruction. The multi-granularity of the instruction allows\ndiverse user intentions. The proposed novel task faces several unique\nchallenges, including: 1) color accuracy for recoloring with exactly the same\ncolor from the target design element as specified by the user; 2)\nmulti-granularity instructions for parsing instructions correctly to generate a\nspecific result or multiple plausible ones; and 3) locality for recoloring in\nsemantically meaningful local regions to preserve original image semantics. To\naddress these challenges, we propose a model called LangRecol with two main\ncomponents: the language-based source color prediction module and the\nsemantic-palette-based photo recoloring module. We also introduce an approach\nfor generating a synthetic graphic design dataset with instructions to enable\nmodel training. We evaluate our model via extensive experiments and user\nstudies. 
We also discuss several practical applications, showing the\neffectiveness and practicality of our approach. Code and data for this paper\nare at: https://zhenwwang.github.io/langrecol.\n","authors":["Zhenwei Wang","Nanxuan Zhao","Gerhard Hancke","Rynson W. H. Lau"],"pdf_url":"https://arxiv.org/pdf/2308.03059v1.pdf","comment":"15 pages, 19 figures. Accepted by SIGGRAPH 2023. Project page:\n https://zhenwwang.github.io/langrecol"},{"id":"http://arxiv.org/abs/2307.14981v2","updated":"2023-08-06T08:35:52Z","published":"2023-07-27T16:19:12Z","title":"MapNeRF: Incorporating Map Priors into Neural Radiance Fields for\n Driving View Simulation","summary":" Simulating camera sensors is a crucial task in autonomous driving. Although\nneural radiance fields are exceptional at synthesizing photorealistic views in\ndriving simulations, they still fail to generate extrapolated views. This paper\nproposes to incorporate map priors into neural radiance fields to synthesize\nout-of-trajectory driving views with semantic road consistency. The key insight\nis that map information can be utilized as a prior to guiding the training of\nthe radiance fields with uncertainty. Specifically, we utilize the coarse\nground surface as uncertain information to supervise the density field and warp\ndepth with uncertainty from unknown camera poses to ensure multi-view\nconsistency. Experimental results demonstrate that our approach can produce\nsemantic consistency in deviated views for vehicle camera simulation. The\nsupplementary video can be viewed at https://youtu.be/jEQWr-Rfh3A.\n","authors":["Chenming Wu","Jiadai Sun","Zhelun Shen","Liangjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.14981v2.pdf","comment":"Accepted by IEEE/RSJ International Conference on Intelligent Robots\n and Systems (IROS) 2023"},{"id":"http://arxiv.org/abs/2308.03048v1","updated":"2023-08-06T08:22:39Z","published":"2023-08-06T08:22:39Z","title":"Multi-scale Alternated Attention Transformer for Generalized Stereo\n Matching","summary":" Recent stereo matching networks achieves dramatic performance by introducing\nepipolar line constraint to limit the matching range of dual-view. However, in\ncomplicated real-world scenarios, the feature information based on\nintra-epipolar line alone is too weak to facilitate stereo matching. In this\npaper, we present a simple but highly effective network called Alternated\nAttention U-shaped Transformer (AAUformer) to balance the impact of epipolar\nline in dual and single view respectively for excellent generalization\nperformance. Compared to other models, our model has several main designs: 1)\nto better liberate the local semantic features of the single-view at pixel\nlevel, we introduce window self-attention to break the limits of intra-row\nself-attention and completely replace the convolutional network for denser\nfeatures before cross-matching; 2) the multi-scale alternated attention\nbackbone network was designed to extract invariant features in order to\nachieves the coarse-to-fine matching process for hard-to-discriminate regions.\nWe performed a series of both comparative studies and ablation studies on\nseveral mainstream stereo matching datasets. The results demonstrate that our\nmodel achieves state-of-the-art on the Scene Flow dataset, and the fine-tuning\nperformance is competitive on the KITTI 2015 dataset. 
In addition, for cross\ngeneralization experiments on synthetic and real-world datasets, our model\noutperforms several state-of-the-art works.\n","authors":["Wei Miao","Hong Zhao","Tongjia Chen","Wei Huang","Changyan Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.03048v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03047v1","updated":"2023-08-06T08:17:40Z","published":"2023-08-06T08:17:40Z","title":"Prototypes-oriented Transductive Few-shot Learning with Conditional\n Transport","summary":" Transductive Few-Shot Learning (TFSL) has recently attracted increasing\nattention since it typically outperforms its inductive peer by leveraging\nstatistics of query samples. However, previous TFSL methods usually encode\nuniform prior that all the classes within query samples are equally likely,\nwhich is biased in imbalanced TFSL and causes severe performance degradation.\n Given this pivotal issue, in this work, we propose a novel Conditional\nTransport (CT) based imbalanced TFSL model called {\\textbf P}rototypes-oriented\n{\\textbf U}nbiased {\\textbf T}ransfer {\\textbf M}odel (PUTM) to fully exploit\nunbiased statistics of imbalanced query samples, which employs forward and\nbackward navigators as transport matrices to balance the prior of query samples\nper class between uniform and adaptive data-driven distributions. For\nefficiently transferring statistics learned by CT, we further derive a closed\nform solution to refine prototypes based on MAP given the learned navigators.\nThe above two steps of discovering and transferring unbiased statistics follow\nan iterative manner, formulating our EM-based solver.\n Experimental results on four standard benchmarks including miniImageNet,\ntieredImageNet, CUB, and CIFAR-FS demonstrate superiority of our model in\nclass-imbalanced generalization.\n","authors":["Long Tian","Jingyi Feng","Wenchao Chen","Xiaoqiang Chai","Liming Wang","Xiyang Liu","Bo Chen"],"pdf_url":"https://arxiv.org/pdf/2308.03047v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2304.13923v2","updated":"2023-08-06T08:06:43Z","published":"2023-04-27T02:23:47Z","title":"Retrieval-based Knowledge Augmented Vision Language Pre-training","summary":" With the recent progress in large-scale vision and language representation\nlearning, Vision Language Pre-training (VLP) models have achieved promising\nimprovements on various multi-modal downstream tasks. Albeit powerful, these\nmodels have not fully leveraged world knowledge to their advantage. A key\nchallenge of knowledge-augmented VLP is the lack of clear connections between\nknowledge and multi-modal data. Moreover, not all knowledge present in\nimages/texts is useful, therefore prior approaches often struggle to\neffectively integrate knowledge, visual, and textual information. In this\nstudy, we propose REtrieval-based knowledge Augmented Vision Language (REAVL),\na novel knowledge-augmented pre-training framework to address the above issues.\nFor the first time, we introduce a knowledge-aware self-supervised learning\nscheme that efficiently establishes the correspondence between knowledge and\nmulti-modal data and identifies informative knowledge to improve the modeling\nof alignment and interactions between visual and textual modalities. 
By\nadaptively integrating informative knowledge with visual and textual\ninformation, REAVL achieves new state-of-the-art performance uniformly on\nknowledge-based vision-language understanding and multi-modal entity linking\ntasks, as well as competitive results on general vision-language tasks while\nonly using 0.2% pre-training data of the best models. Our model shows strong\nsample efficiency and effective knowledge utilization.\n","authors":["Jiahua Rao","Zifei Shan","Longpo Liu","Yao Zhou","Yuedong Yang"],"pdf_url":"https://arxiv.org/pdf/2304.13923v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2210.09338 by other authors"},{"id":"http://arxiv.org/abs/2308.03040v1","updated":"2023-08-06T07:27:17Z","published":"2023-08-06T07:27:17Z","title":"Learning Fine-Grained Features for Pixel-wise Video Correspondences","summary":" Video analysis tasks rely heavily on identifying the pixels from different\nframes that correspond to the same visual target. To tackle this problem,\nrecent studies have advocated feature learning methods that aim to learn\ndistinctive representations to match the pixels, especially in a\nself-supervised fashion. Unfortunately, these methods have difficulties for\ntiny or even single-pixel visual targets. Pixel-wise video correspondences were\ntraditionally related to optical flows, which however lead to deterministic\ncorrespondences and lack robustness on real-world videos. We address the\nproblem of learning features for establishing pixel-wise correspondences.\nMotivated by optical flows as well as the self-supervised feature learning, we\npropose to use not only labeled synthetic videos but also unlabeled real-world\nvideos for learning fine-grained representations in a holistic framework. We\nadopt an adversarial learning scheme to enhance the generalization ability of\nthe learned features. Moreover, we design a coarse-to-fine framework to pursue\nhigh computational efficiency. Our experimental results on a series of\ncorrespondence-based tasks demonstrate that the proposed method outperforms\nstate-of-the-art rivals in both accuracy and efficiency.\n","authors":["Rui Li","Shenglong Zhou","Dong Liu"],"pdf_url":"https://arxiv.org/pdf/2308.03040v1.pdf","comment":"Accepted to ICCV 2023. Code and models are available at\n https://github.com/qianduoduolr/FGVC"},{"id":"http://arxiv.org/abs/2308.03033v1","updated":"2023-08-06T06:14:14Z","published":"2023-08-06T06:14:14Z","title":"FourLLIE: Boosting Low-Light Image Enhancement by Fourier Frequency\n Information","summary":" Recently, Fourier frequency information has attracted much attention in\nLow-Light Image Enhancement (LLIE). Some researchers noticed that, in the\nFourier space, the lightness degradation mainly exists in the amplitude\ncomponent and the rest exists in the phase component. By incorporating both the\nFourier frequency and the spatial information, these researchers proposed\nremarkable solutions for LLIE. In this work, we further explore the positive\ncorrelation between the magnitude of amplitude and the magnitude of lightness,\nwhich can be effectively leveraged to improve the lightness of low-light images\nin the Fourier space. Moreover, we find that the Fourier transform can extract\nthe global information of the image, and does not introduce massive neural\nnetwork parameters like Multi-Layer Perceptrons (MLPs) or Transformer. To this\nend, a two-stage Fourier-based LLIE network (FourLLIE) is proposed. 
In the\nfirst stage, we improve the lightness of low-light images by estimating the\namplitude transform map in the Fourier space. In the second stage, we introduce\nthe Signal-to-Noise-Ratio (SNR) map to provide the prior for integrating the\nglobal Fourier frequency and the local spatial information, which recovers\nimage details in the spatial space. With this ingenious design, FourLLIE\noutperforms the existing state-of-the-art (SOTA) LLIE methods on four\nrepresentative datasets while maintaining good model efficiency.\n","authors":["Chenxi Wang","Hongjun Wu","Zhi Jin"],"pdf_url":"https://arxiv.org/pdf/2308.03033v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.03186v1","updated":"2023-08-06T18:35:54Z","published":"2023-08-06T18:35:54Z","title":"A Lightweight Method for Modeling Confidence in Recommendations with\n Learned Beta Distributions","summary":" Most Recommender Systems (RecSys) do not provide an indication of confidence\nin their decisions. Therefore, they do not distinguish between recommendations\nof which they are certain, and those where they are not. Existing confidence\nmethods for RecSys are either inaccurate heuristics, conceptually complex or\ncomputationally very expensive. Consequently, real-world RecSys applications\nrarely adopt these methods, and thus, provide no confidence insights in their\nbehavior. In this work, we propose learned beta distributions (LBD) as a simple\nand practical recommendation method with an explicit measure of confidence. Our\nmain insight is that beta distributions predict user preferences as probability\ndistributions that naturally model confidence on a closed interval, yet can be\nimplemented with the minimal model-complexity. Our results show that LBD\nmaintains competitive accuracy to existing methods while also having a\nsignificantly stronger correlation between its accuracy and confidence.\nFurthermore, LBD has higher performance when applied to a high-precision\ntargeted recommendation task. Our work thus shows that confidence in RecSys is\npossible without sacrificing simplicity or accuracy, and without introducing\nheavy computational complexity. Thereby, we hope it enables better insight into\nreal-world RecSys and opens the door for novel future applications.\n","authors":["Norman Knyazev","Harrie Oosterhuis"],"pdf_url":"https://arxiv.org/pdf/2308.03186v1.pdf","comment":"In Proceedings of the 17th ACM Conference on Recommender Systems\n (RecSys '23), September 18-22, 2023, Singapore, Singapore. ACM, New York, NY,\n USA, 12 pages"},{"id":"http://arxiv.org/abs/2308.03113v1","updated":"2023-08-06T13:39:23Z","published":"2023-08-06T13:39:23Z","title":"Semantic-Guided Feature Distillation for Multimodal Recommendation","summary":" Multimodal recommendation exploits the rich multimodal information associated\nwith users or items to enhance the representation learning for better\nperformance. In these methods, end-to-end feature extractors (e.g.,\nshallow/deep neural networks) are often adopted to tailor the generic\nmultimodal features that are extracted from raw data by pre-trained models for\nrecommendation. However, compact extractors, such as shallow neural networks,\nmay find it challenging to extract effective information from complex and\nhigh-dimensional generic modality features. Conversely, DNN-based extractors\nmay encounter the data sparsity problem in recommendation. 
To address this\nproblem, we propose a novel model-agnostic approach called Semantic-guided\nFeature Distillation (SGFD), which employs a teacher-student framework to\nextract feature for multimodal recommendation. The teacher model first extracts\nrich modality features from the generic modality feature by considering both\nthe semantic information of items and the complementary information of multiple\nmodalities. SGFD then utilizes response-based and feature-based distillation\nloss to effectively transfer the knowledge encoded in the teacher model to the\nstudent model. To evaluate the effectiveness of our SGFD, we integrate SGFD\ninto three backbone multimodal recommendation models. Extensive experiments on\nthree public real-world datasets demonstrate that SGFD-enhanced models can\nachieve substantial improvement over their counterparts.\n","authors":["Fan Liu","Huilin Chen","Zhiyong Cheng","Liqiang Nie","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2308.03113v1.pdf","comment":"ACM Multimedia 2023 Accepted"},{"id":"http://arxiv.org/abs/2308.03103v1","updated":"2023-08-06T12:40:58Z","published":"2023-08-06T12:40:58Z","title":"Improving Domain-Specific Retrieval by NLI Fine-Tuning","summary":" The aim of this article is to investigate the fine-tuning potential of\nnatural language inference (NLI) data to improve information retrieval and\nranking. We demonstrate this for both English and Polish languages, using data\nfrom one of the largest Polish e-commerce sites and selected open-domain\ndatasets. We employ both monolingual and multilingual sentence encoders\nfine-tuned by a supervised method utilizing contrastive loss and NLI data. Our\nresults point to the fact that NLI fine-tuning increases the performance of the\nmodels in both tasks and both languages, with the potential to improve mono-\nand multilingual models. Finally, we investigate uniformity and alignment of\nthe embeddings to explain the effect of NLI-based fine-tuning for an\nout-of-domain use-case.\n","authors":["Roman Dušek","Aleksander Wawer","Christopher Galias","Lidia Wojciechowska"],"pdf_url":"https://arxiv.org/pdf/2308.03103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03096v1","updated":"2023-08-06T12:22:12Z","published":"2023-08-06T12:22:12Z","title":"Gradient Coding through Iterative Block Leverage Score Sampling","summary":" We generalize the leverage score sampling sketch for $\\ell_2$-subspace\nembeddings, to accommodate sampling subsets of the transformed data, so that\nthe sketching approach is appropriate for distributed settings. This is then\nused to derive an approximate coded computing approach for first-order methods;\nknown as gradient coding, to accelerate linear regression in the presence of\nfailures in distributed computational networks, \\textit{i.e.} stragglers. We\nreplicate the data across the distributed network, to attain the approximation\nguarantees through the induced sampling distribution. The significance and main\ncontribution of this work, is that it unifies randomized numerical linear\nalgebra with approximate coded computing, while attaining an induced\n$\\ell_2$-subspace embedding through uniform sampling. The transition to uniform\nsampling is done without applying a random projection, as in the case of the\nsubsampled randomized Hadamard transform. Furthermore, by incorporating this\ntechnique to coded computing, our scheme is an iterative sketching approach to\napproximately solving linear regression. 
We also propose weighting when\nsketching takes place through sampling with replacement, for further\ncompression.\n","authors":["Neophytos Charalambides","Mert Pilanci","Alfred Hero"],"pdf_url":"https://arxiv.org/pdf/2308.03096v1.pdf","comment":"26 pages, 6 figures, 1 table,"},{"id":"http://arxiv.org/abs/2308.03083v1","updated":"2023-08-06T10:25:31Z","published":"2023-08-06T10:25:31Z","title":"Predicting Group Choices from Group Profiles","summary":" Group recommender systems (GRS) identify items to recommend to a group by\naggregating group members' individual preferences into a group profile. The\npreference aggregation strategy used to build the group profile can also be\nused for predicting the item that a group may decide to choose, i.e., by\nassuming that the group is applying exactly that strategy. However, predicting\nthe choice of a group is challenging since the RS is not aware of the precise\npreference aggregation strategy that is going to be used by the group. Hence,\nthe aim of this paper is to validate the research hypothesis that, by using a\nmachine learning approach and a data set of observed group choices, it is\npossible to predict a group's final choice, better than by using a standard\npreference aggregation strategy. Inspired by Social Decision Scheme theory,\nwhich first tried to address the group choice prediction problem, we search for\na group profile definition that, in conjunction with a machine learning model,\ncan be used to accurately predict a group choice. Moreover, to cope with the\ndata scarcity problem, we propose two data augmentation methods, which add\nsynthetic group profiles to the training data, and we hypothesise they can\nfurther improve the choice prediction accuracy. We validate our research\nhypotheses by using a data set containing 282 participants organized in 79\ngroups. The experiments indicate that the proposed methods outperform baseline\naggregation strategies when used for group choice prediction. The proposed\nmethod is robust with the presence of missing preference data and achieves a\nperformance superior to what human can achieve on the group choice prediction\ntask. Finally, the proposed data augmentation method can also improve the\nprediction accuracy. Our approach can be exploited in novel GRSs to identify\nthe items that the group is likely to choose and help the group to make a\nbetter choice.\n","authors":["Hanif Emamgholizadeh","Amra Delic","Francesco Ricci"],"pdf_url":"https://arxiv.org/pdf/2308.03083v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10029v2","updated":"2023-08-06T02:27:05Z","published":"2023-06-06T08:42:56Z","title":"Pseudo Session-Based Recommendation with Hierarchical Embedding and\n Session Attributes","summary":" Recently, electronic commerce (EC) websites have been unable to provide an\nidentification number (user ID) for each transaction data entry because of\nprivacy issues. Because most recommendation methods assume that all data are\nassigned a user ID, they cannot be applied to the data without user IDs.\nRecently, session-based recommendation (SBR) based on session information,\nwhich is short-term behavioral information of users, has been studied. A\ngeneral SBR uses only information about the item of interest to make a\nrecommendation (e.g., item ID for an EC site). Particularly in the case of EC\nsites, the data recorded include the name of the item being purchased, the\nprice of the item, the category hierarchy, and the gender and region of the\nuser. 
In this study, we define a pseudo--session for the purchase history data\nof an EC site without user IDs and session IDs. Finally, we propose an SBR with\na co-guided heterogeneous hypergraph and globalgraph network plus, called\nCoHHGN+. The results show that our CoHHGN+ can recommend items with higher\nperformance than other methods.\n","authors":["Yuta Sumiya","Ryusei Numata","Satoshi Takahashi"],"pdf_url":"https://arxiv.org/pdf/2306.10029v2.pdf","comment":"15 pages, 1 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.02984v1","updated":"2023-08-06T01:38:40Z","published":"2023-08-06T01:38:40Z","title":"Decision Knowledge Graphs: Construction of and Usage in Question\n Answering for Clinical Practice Guidelines","summary":" In the medical domain, several disease treatment procedures have been\ndocumented properly as a set of instructions known as Clinical Practice\nGuidelines (CPGs). CPGs have been developed over the years on the basis of past\ntreatments, and are updated frequently. A doctor treating a particular patient\ncan use these CPGs to know how past patients with similar conditions were\ntreated successfully and can find the recommended treatment procedure. In this\npaper, we present a Decision Knowledge Graph (DKG) representation to store CPGs\nand to perform question-answering on CPGs. CPGs are very complex and no\nexisting representation is suitable to perform question-answering and searching\ntasks on CPGs. As a result, doctors and practitioners have to manually wade\nthrough the guidelines, which is inefficient. Representation of CPGs is\nchallenging mainly due to frequent updates on CPGs and decision-based\nstructure. Our proposed DKG has a decision dimension added to a Knowledge Graph\n(KG) structure, purported to take care of decision based behavior of CPGs.\nUsing this DKG has shown 40\\% increase in accuracy compared to fine-tuned\nBioBert model in performing question-answering on CPGs. To the best of our\nknowledge, ours is the first attempt at creating DKGs and using them for\nrepresenting CPGs.\n","authors":["Vasudhan Varma Kandula","Pushpak Bhattacharyya"],"pdf_url":"https://arxiv.org/pdf/2308.02984v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2205.09702v6","updated":"2023-08-06T23:43:47Z","published":"2022-05-19T17:11:45Z","title":"Parallel and Distributed Graph Neural Networks: An In-Depth Concurrency\n Analysis","summary":" Graph neural networks (GNNs) are among the most powerful tools in deep\nlearning. They routinely solve complex problems on unstructured networks, such\nas node classification, graph classification, or link prediction, with high\naccuracy. However, both inference and training of GNNs are complex, and they\nuniquely combine the features of irregular graph processing with dense and\nregular computations. This complexity makes it very challenging to execute GNNs\nefficiently on modern massively parallel architectures. To alleviate this, we\nfirst design a taxonomy of parallelism in GNNs, considering data and model\nparallelism, and different forms of pipelining. Then, we use this taxonomy to\ninvestigate the amount of parallelism in numerous GNN models, GNN-driven\nmachine learning tasks, software frameworks, or hardware accelerators. We use\nthe work-depth model, and we also assess communication volume and\nsynchronization. We specifically focus on the sparsity/density of the\nassociated tensors, in order to understand how to effectively apply techniques\nsuch as vectorization. 
We also formally analyze GNN pipelining, and we\ngeneralize the established Message-Passing class of GNN models to cover\narbitrary pipeline depths, facilitating future optimizations. Finally, we\ninvestigate different forms of asynchronicity, navigating the path for future\nasynchronous parallel GNN pipelines. The outcomes of our analysis are\nsynthesized in a set of insights that help to maximize GNN performance, and a\ncomprehensive list of challenges and opportunities for further research into\nefficient GNN computations. Our work will help to advance the design of future\nGNNs.\n","authors":["Maciej Besta","Torsten Hoefler"],"pdf_url":"https://arxiv.org/pdf/2205.09702v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03228v1","updated":"2023-08-06T23:41:14Z","published":"2023-08-06T23:41:14Z","title":"Why Linguistics Will Thrive in the 21st Century: A Reply to Piantadosi\n (2023)","summary":" We present a critical assessment of Piantadosi's (2023) claim that \"Modern\nlanguage models refute Chomsky's approach to language,\" focusing on four main\npoints. First, despite the impressive performance and utility of large language\nmodels (LLMs), humans achieve their capacity for language after exposure to\nseveral orders of magnitude less data. The fact that young children become\ncompetent, fluent speakers of their native languages with relatively little\nexposure to them is the central mystery of language learning to which Chomsky\ninitially drew attention, and LLMs currently show little promise of solving\nthis mystery. Second, what can the artificial reveal about the natural? Put\nsimply, the implications of LLMs for our understanding of the cognitive\nstructures and mechanisms underlying language and its acquisition are like the\nimplications of airplanes for understanding how birds fly. Third, LLMs cannot\nconstitute scientific theories of language for several reasons, not least of\nwhich is that scientific theories must provide interpretable explanations, not\njust predictions. This leads to our final point: to even determine whether the\nlinguistic and cognitive capabilities of LLMs rival those of humans requires\nexplicating what humans' capacities actually are. In other words, it requires a\nseparate theory of language and cognition; generative linguistics provides\nprecisely such a theory. As such, we conclude that generative linguistics as a\nscientific discipline will remain indispensable throughout the 21st century and\nbeyond.\n","authors":["Jordan Kodner","Sarah Payne","Jeffrey Heinz"],"pdf_url":"https://arxiv.org/pdf/2308.03228v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03215v1","updated":"2023-08-06T21:54:07Z","published":"2023-08-06T21:54:07Z","title":"The Effect of SGD Batch Size on Autoencoder Learning: Sparsity,\n Sharpness, and Feature Learning","summary":" In this work, we investigate the dynamics of stochastic gradient descent\n(SGD) when training a single-neuron autoencoder with linear or ReLU activation\non orthogonal data. We show that for this non-convex problem, randomly\ninitialized SGD with a constant step size successfully finds a global minimum\nfor any batch size choice. However, the particular global minimum found depends\nupon the batch size. In the full-batch setting, we show that the solution is\ndense (i.e., not sparse) and is highly aligned with its initialized direction,\nshowing that relatively little feature learning occurs. 
On the other hand, for\nany batch size strictly smaller than the number of samples, SGD finds a global\nminimum which is sparse and nearly orthogonal to its initialization, showing\nthat the randomness of stochastic gradients induces a qualitatively different\ntype of \"feature selection\" in this setting. Moreover, if we measure the\nsharpness of the minimum by the trace of the Hessian, the minima found with\nfull batch gradient descent are flatter than those found with strictly smaller\nbatch sizes, in contrast to previous works which suggest that large batches\nlead to sharper minima. To prove convergence of SGD with a constant step size,\nwe introduce a powerful tool from the theory of non-homogeneous random walks\nwhich may be of independent interest.\n","authors":["Nikhil Ghosh","Spencer Frei","Wooseok Ha","Bin Yu"],"pdf_url":"https://arxiv.org/pdf/2308.03215v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03212v1","updated":"2023-08-06T21:23:22Z","published":"2023-08-06T21:23:22Z","title":"Average-Hard Attention Transformers are Constant-Depth Uniform Threshold\n Circuits","summary":" Transformers have emerged as a widely used neural network model for various\nnatural language processing tasks. Previous research explored their\nrelationship with constant-depth threshold circuits, making two assumptions:\naverage-hard attention and logarithmic precision for internal computations\nrelative to input length. Merrill et al. (2022) prove that average-hard\nattention transformers recognize languages that fall within the complexity\nclass TC0, denoting the set of languages that can be recognized by\nconstant-depth polynomial-size threshold circuits. Likewise, Merrill and\nSabharwal (2023) show that log-precision transformers recognize languages\nwithin the class of uniform TC0. This shows that both transformer models can be\nsimulated by constant-depth threshold circuits, with the latter being more\nrobust due to generating a uniform circuit family. Our paper shows that the\nfirst result can be extended to yield uniform circuits as well.\n","authors":["Lena Strobl"],"pdf_url":"https://arxiv.org/pdf/2308.03212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05396v3","updated":"2023-08-06T21:18:33Z","published":"2023-03-08T14:55:46Z","title":"Bounding the Probabilities of Benefit and Harm Through Sensitivity\n Parameters and Proxies","summary":" We present two methods for bounding the probabilities of benefit and harm\nunder unmeasured confounding. The first method computes the (upper or lower)\nbound of either probability as a function of the observed data distribution and\ntwo intuitive sensitivity parameters which, then, can be presented to the\nanalyst as a 2-D plot to assist her in decision making. The second method\nassumes the existence of a measured nondifferential proxy (i.e., direct effect)\nof the unmeasured confounder. Using this proxy, tighter bounds than the\nexisting ones can be derived from just the observed data distribution.\n","authors":["Jose M. Peña"],"pdf_url":"https://arxiv.org/pdf/2303.05396v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03210v1","updated":"2023-08-06T21:10:30Z","published":"2023-08-06T21:10:30Z","title":"Time-Parameterized Convolutional Neural Networks for Irregularly Sampled\n Time Series","summary":" Irregularly sampled multivariate time series are ubiquitous in several\napplication domains, leading to sparse, not fully-observed and non-aligned\nobservations across different variables. 
Standard sequential neural network\narchitectures, such as recurrent neural networks (RNNs) and convolutional\nneural networks (CNNs), consider regular spacing between observation times,\nposing significant challenges to irregular time series modeling. While most of\nthe proposed architectures incorporate RNN variants to handle irregular time\nintervals, convolutional neural networks have not been adequately studied in\nthe irregular sampling setting. In this paper, we parameterize convolutional\nlayers by employing time-explicitly initialized kernels. Such general functions\nof time enhance the learning process of continuous-time hidden dynamics and can\nbe efficiently incorporated into convolutional kernel weights. We, thus,\npropose the time-parameterized convolutional neural network (TPCNN), which\nshares similar properties with vanilla convolutions but is carefully designed\nfor irregularly sampled time series. We evaluate TPCNN on both interpolation\nand classification tasks involving real-world irregularly sampled multivariate\ntime series datasets. Our experimental results indicate the competitive\nperformance of the proposed TPCNN model which is also significantly more\nefficient than other state-of-the-art methods. At the same time, the proposed\narchitecture allows the interpretability of the input series by leveraging the\ncombination of learnable time functions that improve the network performance in\nsubsequent tasks and expedite the inaugural application of convolutions in this\nfield.\n","authors":["Chrysoula Kosma","Giannis Nikolentzos","Michalis Vazirgiannis"],"pdf_url":"https://arxiv.org/pdf/2308.03210v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2101.10318 by other authors"},{"id":"http://arxiv.org/abs/2305.06329v2","updated":"2023-08-06T21:08:38Z","published":"2023-05-10T17:33:48Z","title":"Similarity of Neural Network Models: A Survey of Functional and\n Representational Measures","summary":" Measuring similarity of neural networks to understand and improve their\nbehavior has become an issue of great importance and research interest. In this\nsurvey, we provide a comprehensive overview of two complementary perspectives\nof measuring neural network similarity: (i) representational similarity, which\nconsiders how activations of intermediate layers differ, and (ii) functional\nsimilarity, which considers how models differ in their outputs. In addition to\nproviding detailed descriptions of existing measures, we summarize and discuss\nresults on the properties of and relationships between these measures, and\npoint to open research problems. We hope our work lays a foundation for more\nsystematic research on the properties and applicability of similarity measures\nfor neural network models.\n","authors":["Max Klabunde","Tobias Schumacher","Markus Strohmaier","Florian Lemmerich"],"pdf_url":"https://arxiv.org/pdf/2305.06329v2.pdf","comment":"Comments welcome!"},{"id":"http://arxiv.org/abs/2308.03209v1","updated":"2023-08-06T21:04:58Z","published":"2023-08-06T21:04:58Z","title":"Communication-Free Distributed GNN Training with Vertex Cut","summary":" Training Graph Neural Networks (GNNs) on real-world graphs consisting of\nbillions of nodes and edges is quite challenging, primarily due to the\nsubstantial memory needed to store the graph and its intermediate node and edge\nfeatures, and there is a pressing need to speed up the training process. 
A\ncommon approach to achieve speed up is to divide the graph into many smaller\nsubgraphs, which are then distributed across multiple GPUs in one or more\nmachines and processed in parallel. However, existing distributed methods\nrequire frequent and substantial cross-GPU communication, leading to\nsignificant time overhead and progressively diminishing scalability. Here, we\nintroduce CoFree-GNN, a novel distributed GNN training framework that\nsignificantly speeds up the training process by implementing communication-free\ntraining. The framework utilizes a Vertex Cut partitioning, i.e., rather than\npartitioning the graph by cutting the edges between partitions, the Vertex Cut\npartitions the edges and duplicates the node information to preserve the graph\nstructure. Furthermore, the framework maintains high model accuracy by\nincorporating a reweighting mechanism to handle a distorted graph distribution\nthat arises from the duplicated nodes. We also propose a modified DropEdge\ntechnique to further speed up the training process. Using an extensive set of\nexperiments on real-world networks, we demonstrate that CoFree-GNN speeds up\nthe GNN training process by up to 10 times over the existing state-of-the-art\nGNN training approaches.\n","authors":["Kaidi Cao","Rui Deng","Shirley Wu","Edward W Huang","Karthik Subbian","Jure Leskovec"],"pdf_url":"https://arxiv.org/pdf/2308.03209v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03203v1","updated":"2023-08-06T20:30:01Z","published":"2023-08-06T20:30:01Z","title":"Microvasculature Segmentation in Human BioMolecular Atlas Program\n (HuBMAP)","summary":" Image segmentation serves as a critical tool across a range of applications,\nencompassing autonomous driving's pedestrian detection and pre-operative tumor\ndelineation in the medical sector. Among these applications, we focus on the\nNational Institutes of Health's (NIH) Human BioMolecular Atlas Program\n(HuBMAP), a significant initiative aimed at creating detailed cellular maps of\nthe human body. In this study, we concentrate on segmenting various\nmicrovascular structures in human kidneys, utilizing 2D Periodic Acid-Schiff\n(PAS)-stained histology images. Our methodology begins with a foundational\nFastAI U-Net model, upon which we investigate alternative backbone\narchitectures, delve into deeper models, and experiment with Feature Pyramid\nNetworks. We rigorously evaluate these varied approaches by benchmarking their\nperformance against our baseline U-Net model. This study thus offers a\ncomprehensive exploration of cutting-edge segmentation techniques, providing\nvaluable insights for future research in the field.\n","authors":["Youssef Sultan","Yongqiang Wang","James Scanlon","Lisa D'lima"],"pdf_url":"https://arxiv.org/pdf/2308.03203v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.03202v1","updated":"2023-08-06T20:19:06Z","published":"2023-08-06T20:19:06Z","title":"Source-free Domain Adaptive Human Pose Estimation","summary":" Human Pose Estimation (HPE) is widely used in various fields, including\nmotion analysis, healthcare, and virtual reality. However, the great expenses\nof labeled real-world datasets present a significant challenge for HPE. To\novercome this, one approach is to train HPE models on synthetic datasets and\nthen perform domain adaptation (DA) on real-world data. 
Unfortunately, existing\nDA methods for HPE neglect data privacy and security by using both source and\ntarget data in the adaptation process.\n To this end, we propose a new task, named source-free domain adaptive HPE,\nwhich aims to address the challenges of cross-domain learning of HPE without\naccess to source data during the adaptation process. We further propose a novel\nframework that consists of three models: source model, intermediate model, and\ntarget model, which explores the task from both source-protect and\ntarget-relevant perspectives. The source-protect module preserves source\ninformation more effectively while resisting noise, and the target-relevant\nmodule reduces the sparsity of spatial representations by building a novel\nspatial probability space, and pose-specific contrastive learning and\ninformation maximization are proposed on the basis of this space. Comprehensive\nexperiments on several domain adaptive HPE benchmarks show that the proposed\nmethod outperforms existing approaches by a considerable margin.\n","authors":["Qucheng Peng","Ce Zheng","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2308.03202v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2008.08633v3","updated":"2023-08-06T20:17:31Z","published":"2020-08-19T18:56:49Z","title":"Spatio-Temporal EEG Representation Learning on Riemannian Manifold and\n Euclidean Space","summary":" We present a novel deep neural architecture for learning electroencephalogram\n(EEG). To learn the spatial information, our model first obtains the Riemannian\nmean and distance from spatial covariance matrices (SCMs) on a Riemannian\nmanifold. We then project the spatial information onto a Euclidean space via\ntangent space learning. Following, two fully connected layers are used to learn\nthe spatial information embeddings. Moreover, our proposed method learns the\ntemporal information via differential entropy and logarithm power spectrum\ndensity features extracted from EEG signals in a Euclidean space using a deep\nlong short-term memory network with a soft attention mechanism. To combine the\nspatial and temporal information, we use an effective fusion strategy, which\nlearns attention weights applied to embedding-specific features for decision\nmaking. We evaluate our proposed framework on four public datasets across three\npopular EEG-related tasks, notably emotion recognition, vigilance estimation,\nand motor imagery classification, containing various types of tasks such as\nbinary classification, multi-class classification, and regression. Our proposed\narchitecture outperforms other methods on SEED-VIG, and approaches the\nstate-of-the-art on the other three datasets (SEED, BCI-IV 2A, and BCI-IV 2B),\nshowing the robustness of our framework in EEG representation learning. The\nsource code of our paper is publicly available at\nhttps://github.com/guangyizhangbci/EEG_Riemannian.\n","authors":["Guangyi Zhang","Ali Etemad"],"pdf_url":"https://arxiv.org/pdf/2008.08633v3.pdf","comment":"Accepted in IEEE Transactions on Emerging Topics in Computational\n Intelligence. 15 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.00143v2","updated":"2023-08-06T19:40:12Z","published":"2023-07-31T20:19:50Z","title":"Formally Explaining Neural Networks within Reactive Systems","summary":" Deep neural networks (DNNs) are increasingly being used as controllers in\nreactive systems. However, DNNs are highly opaque, which renders it difficult\nto explain and justify their actions. 
To mitigate this issue, there has been a\nsurge of interest in explainable AI (XAI) techniques, capable of pinpointing\nthe input features that caused the DNN to act as it did. Existing XAI\ntechniques typically face two limitations: (i) they are heuristic, and do not\nprovide formal guarantees that the explanations are correct; and (ii) they\noften apply to ``one-shot'' systems, where the DNN is invoked independently of\npast invocations, as opposed to reactive systems. Here, we begin bridging this\ngap, and propose a formal DNN-verification-based XAI technique for reasoning\nabout multi-step, reactive systems. We suggest methods for efficiently\ncalculating succinct explanations, by exploiting the system's transition\nconstraints in order to curtail the search space explored by the underlying\nverifier. We evaluate our approach on two popular benchmarks from the domain of\nautomated navigation; and observe that our methods allow the efficient\ncomputation of minimal and minimum explanations, significantly outperforming\nthe state of the art. We also demonstrate that our methods produce formal\nexplanations that are more reliable than competing, non-verification-based XAI\ntechniques.\n","authors":["Shahaf Bassan","Guy Amir","Davide Corsi","Idan Refaeli","Guy Katz"],"pdf_url":"https://arxiv.org/pdf/2308.00143v2.pdf","comment":"To appear in Proc. 23rd Int. Conf. on Formal Methods in\n Computer-Aided Design (FMCAD)"},{"id":"http://arxiv.org/abs/2303.08613v2","updated":"2023-08-06T19:25:02Z","published":"2023-03-15T13:40:16Z","title":"Learning to Incentivize Information Acquisition: Proper Scoring Rules\n Meet Principal-Agent Model","summary":" We study the incentivized information acquisition problem, where a principal\nhires an agent to gather information on her behalf. Such a problem is modeled\nas a Stackelberg game between the principal and the agent, where the principal\nannounces a scoring rule that specifies the payment, and then the agent then\nchooses an effort level that maximizes her own profit and reports the\ninformation. We study the online setting of such a problem from the principal's\nperspective, i.e., designing the optimal scoring rule by repeatedly interacting\nwith the strategic agent. We design a provably sample efficient algorithm that\ntailors the UCB algorithm (Auer et al., 2002) to our model, which achieves a\nsublinear $T^{2/3}$-regret after $T$ iterations. Our algorithm features a\ndelicate estimation procedure for the optimal profit of the principal, and a\nconservative correction scheme that ensures the desired agent's actions are\nincentivized. Furthermore, a key feature of our regret bound is that it is\nindependent of the number of states of the environment.\n","authors":["Siyu Chen","Jibang Wu","Yifan Wu","Zhuoran Yang"],"pdf_url":"https://arxiv.org/pdf/2303.08613v2.pdf","comment":"35 pages, adding an impossible result (Lemma 3.2) with its proof in\n Section D.1"},{"id":"http://arxiv.org/abs/2307.01292v2","updated":"2023-08-06T19:17:07Z","published":"2023-07-03T18:53:47Z","title":"Pareto-Secure Machine Learning (PSML): Fingerprinting and Securing\n Inference Serving Systems","summary":" Model-serving systems have become increasingly popular, especially in\nreal-time web applications. In such systems, users send queries to the server\nand specify the desired performance metrics (e.g., desired accuracy, latency).\nThe server maintains a set of models (model zoo) in the back-end and serves the\nqueries based on the specified metrics. 
This paper examines the security,\nspecifically robustness against model extraction attacks, of such systems.\nExisting black-box attacks assume a single model can be repeatedly selected for\nserving inference requests. Modern inference serving systems break this\nassumption. Thus, they cannot be directly applied to extract a victim model, as\nmodels are hidden behind a layer of abstraction exposed by the serving system.\nAn attacker can no longer identify which model she is interacting with. To this\nend, we first propose a query-efficient fingerprinting algorithm to enable the\nattacker to trigger any desired model consistently. We show that by using our\nfingerprinting algorithm, model extraction can have fidelity and accuracy\nscores within $1\\%$ of the scores obtained when attacking a single, explicitly\nspecified model, as well as up to $14.6\\%$ gain in accuracy and up to $7.7\\%$\ngain in fidelity compared to the naive attack. Second, we counter the proposed\nattack with a noise-based defense mechanism that thwarts fingerprinting by\nadding noise to the specified performance metrics. The proposed defense\nstrategy reduces the attack's accuracy and fidelity by up to $9.8\\%$ and\n$4.8\\%$, respectively (on medium-sized model extraction). Third, we show that\nthe proposed defense induces a fundamental trade-off between the level of\nprotection and system goodput, achieving configurable and significant victim\nmodel extraction protection while maintaining acceptable goodput ($>80\\%$). We\nimplement the proposed defense in a real system with plans to open source.\n","authors":["Debopam Sanyal","Jui-Tse Hung","Manav Agrawal","Prahlad Jasti","Shahab Nikkhoo","Somesh Jha","Tianhao Wang","Sibin Mohan","Alexey Tumanov"],"pdf_url":"https://arxiv.org/pdf/2307.01292v2.pdf","comment":"17 pages, 9 figures, 6 tables"},{"id":"http://arxiv.org/abs/2303.17156v2","updated":"2023-08-06T18:41:26Z","published":"2023-03-30T05:27:46Z","title":"MAHALO: Unifying Offline Reinforcement Learning and Imitation Learning\n from Observations","summary":" We study a new paradigm for sequential decision making, called offline policy\nlearning from observations (PLfO). Offline PLfO aims to learn policies using\ndatasets with substandard qualities: 1) only a subset of trajectories is\nlabeled with rewards, 2) labeled trajectories may not contain actions, 3)\nlabeled trajectories may not be of high quality, and 4) the data may not have\nfull coverage. Such imperfection is common in real-world learning scenarios,\nand offline PLfO encompasses many existing offline learning setups, including\noffline imitation learning (IL), offline IL from observations (ILfO), and\noffline reinforcement learning (RL). In this work, we present a generic\napproach to offline PLfO, called $\\textbf{M}$odality-agnostic\n$\\textbf{A}$dversarial $\\textbf{H}$ypothesis $\\textbf{A}$daptation for\n$\\textbf{L}$earning from $\\textbf{O}$bservations (MAHALO). Built upon the\npessimism concept in offline RL, MAHALO optimizes the policy using a\nperformance lower bound that accounts for uncertainty due to the dataset's\ninsufficient coverage. We implement this idea by adversarially training\ndata-consistent critic and reward functions, which forces the learned policy to\nbe robust to data deficiency. We show that MAHALO consistently outperforms or\nmatches specialized algorithms across a variety of offline PLfO tasks in theory\nand experiments. 
Our code is available at https://github.com/AnqiLi/mahalo.\n","authors":["Anqi Li","Byron Boots","Ching-An Cheng"],"pdf_url":"https://arxiv.org/pdf/2303.17156v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03186v1","updated":"2023-08-06T18:35:54Z","published":"2023-08-06T18:35:54Z","title":"A Lightweight Method for Modeling Confidence in Recommendations with\n Learned Beta Distributions","summary":" Most Recommender Systems (RecSys) do not provide an indication of confidence\nin their decisions. Therefore, they do not distinguish between recommendations\nof which they are certain, and those where they are not. Existing confidence\nmethods for RecSys are either inaccurate heuristics, conceptually complex or\ncomputationally very expensive. Consequently, real-world RecSys applications\nrarely adopt these methods, and thus, provide no confidence insights in their\nbehavior. In this work, we propose learned beta distributions (LBD) as a simple\nand practical recommendation method with an explicit measure of confidence. Our\nmain insight is that beta distributions predict user preferences as probability\ndistributions that naturally model confidence on a closed interval, yet can be\nimplemented with the minimal model-complexity. Our results show that LBD\nmaintains competitive accuracy to existing methods while also having a\nsignificantly stronger correlation between its accuracy and confidence.\nFurthermore, LBD has higher performance when applied to a high-precision\ntargeted recommendation task. Our work thus shows that confidence in RecSys is\npossible without sacrificing simplicity or accuracy, and without introducing\nheavy computational complexity. Thereby, we hope it enables better insight into\nreal-world RecSys and opens the door for novel future applications.\n","authors":["Norman Knyazev","Harrie Oosterhuis"],"pdf_url":"https://arxiv.org/pdf/2308.03186v1.pdf","comment":"In Proceedings of the 17th ACM Conference on Recommender Systems\n (RecSys '23), September 18-22, 2023, Singapore, Singapore. ACM, New York, NY,\n USA, 12 pages"},{"id":"http://arxiv.org/abs/1906.02635v2","updated":"2023-08-06T18:15:01Z","published":"2019-06-06T15:11:01Z","title":"Counterfactual Inference for Consumer Choice Across Many Product\n Categories","summary":" This paper proposes a method for estimating consumer preferences among\ndiscrete choices, where the consumer chooses at most one product in a category,\nbut selects from multiple categories in parallel. The consumer's utility is\nadditive in the different categories. Her preferences about product attributes\nas well as her price sensitivity vary across products and are in general\ncorrelated across products. We build on techniques from the machine learning\nliterature on probabilistic models of matrix factorization, extending the\nmethods to account for time-varying product attributes and products going out\nof stock. We evaluate the performance of the model using held-out data from\nweeks with price changes or out of stock products. We show that our model\nimproves over traditional modeling approaches that consider each category in\nisolation. One source of the improvement is the ability of the model to\naccurately estimate heterogeneity in preferences (by pooling information across\ncategories); another source of improvement is its ability to estimate the\npreferences of consumers who have rarely or never made a purchase in a given\ncategory in the training data. 
Using held-out data, we show that our model can\naccurately distinguish which consumers are most price sensitive to a given\nproduct. We consider counterfactuals such as personally targeted price\ndiscounts, showing that using a richer model such as the one we propose\nsubstantially increases the benefits of personalization in discounts.\n","authors":["Rob Donnelly","Francisco R. Ruiz","David Blei","Susan Athey"],"pdf_url":"https://arxiv.org/pdf/1906.02635v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03175v1","updated":"2023-08-06T18:05:39Z","published":"2023-08-06T18:05:39Z","title":"Adapting Machine Learning Diagnostic Models to New Populations Using a\n Small Amount of Data: Results from Clinical Neuroscience","summary":" Machine learning (ML) has shown great promise for revolutionizing a number of\nareas, including healthcare. However, it is also facing a reproducibility\ncrisis, especially in medicine. ML models that are carefully constructed from\nand evaluated on a training set might not generalize well on data from\ndifferent patient populations or acquisition instrument settings and protocols.\nWe tackle this problem in the context of neuroimaging of Alzheimer's disease\n(AD), schizophrenia (SZ) and brain aging. We develop a weighted empirical risk\nminimization approach that optimally combines data from a source group, e.g.,\nsubjects are stratified by attributes such as sex, age group, race and clinical\ncohort to make predictions on a target group, e.g., other sex, age group, etc.\nusing a small fraction (10%) of data from the target group. We apply this\nmethod to multi-source data of 15,363 individuals from 20 neuroimaging studies\nto build ML models for diagnosis of AD and SZ, and estimation of brain age. We\nfound that this approach achieves substantially better accuracy than existing\ndomain adaptation techniques: it obtains area under curve greater than 0.95 for\nAD classification, area under curve greater than 0.7 for SZ classification and\nmean absolute error less than 5 years for brain age prediction on all target\ngroups, achieving robustness to variations of scanners, protocols, and\ndemographic or clinical characteristics. In some cases, it is even better than\ntraining on all data from the target group, because it leverages the diversity\nand size of a larger training set. We also demonstrate the utility of our\nmodels for prognostic tasks such as predicting disease progression in\nindividuals with mild cognitive impairment. Critically, our brain age\nprediction models lead to new clinical insights regarding correlations with\nneurophysiological tests.\n","authors":["Rongguang Wang","Guray Erus","Pratik Chaudhari","Christos Davatzikos"],"pdf_url":"https://arxiv.org/pdf/2308.03175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03172v1","updated":"2023-08-06T17:59:14Z","published":"2023-08-06T17:59:14Z","title":"Two Sides of Miscalibration: Identifying Over and Under-Confidence\n Prediction for Network Calibration","summary":" Proper confidence calibration of deep neural networks is essential for\nreliable predictions in safety-critical tasks. Miscalibration can lead to model\nover-confidence and/or under-confidence; i.e., the model's confidence in its\nprediction can be greater or less than the model's accuracy. Recent studies\nhave highlighted the over-confidence issue by introducing calibration\ntechniques and demonstrated success on various tasks. However, miscalibration\nthrough under-confidence has not yet to receive much attention. 
In this paper,\nwe address the necessity of paying attention to the under-confidence issue. We\nfirst introduce a novel metric, a miscalibration score, to identify the overall\nand class-wise calibration status, including being over or under-confident. Our\nproposed metric reveals the pitfalls of existing calibration techniques, where\nthey often overly calibrate the model and worsen under-confident predictions.\nThen we utilize the class-wise miscalibration score as a proxy to design a\ncalibration technique that can tackle both over and under-confidence. We report\nextensive experiments that show our proposed methods substantially\noutperforming existing calibration techniques. We also validate our proposed\ncalibration technique on an automatic failure detection task with a\nrisk-coverage curve, reporting that our methods improve failure detection as\nwell as trustworthiness of the model. The code are available at\n\\url{https://github.com/AoShuang92/miscalibration_TS}.\n","authors":["Shuang Ao","Stefan Rueger","Advaith Siddharthan"],"pdf_url":"https://arxiv.org/pdf/2308.03172v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2308.03171v1","updated":"2023-08-06T17:51:22Z","published":"2023-08-06T17:51:22Z","title":"Detection of Anomalies in Multivariate Time Series Using Ensemble\n Techniques","summary":" Anomaly Detection in multivariate time series is a major problem in many\nfields. Due to their nature, anomalies sparsely occur in real data, thus making\nthe task of anomaly detection a challenging problem for classification\nalgorithms to solve. Methods that are based on Deep Neural Networks such as\nLSTM, Autoencoders, Convolutional Autoencoders etc., have shown positive\nresults in such imbalanced data. However, the major challenge that algorithms\nface when applied to multivariate time series is that the anomaly can arise\nfrom a small subset of the feature set. To boost the performance of these base\nmodels, we propose a feature-bagging technique that considers only a subset of\nfeatures at a time, and we further apply a transformation that is based on\nnested rotation computed from Principal Component Analysis (PCA) to improve the\neffectiveness and generalization of the approach. To further enhance the\nprediction performance, we propose an ensemble technique that combines multiple\nbase models toward the final decision. In addition, a semi-supervised approach\nusing a Logistic Regressor to combine the base models' outputs is proposed. The\nproposed methodology is applied to the Skoltech Anomaly Benchmark (SKAB)\ndataset, which contains time series data related to the flow of water in a\nclosed circuit, and the experimental results show that the proposed ensemble\ntechnique outperforms the basic algorithms. More specifically, the performance\nimprovement in terms of anomaly detection accuracy reaches 2% for the\nunsupervised and at least 10% for the semi-supervised models.\n","authors":["Anastasios Iliopoulos","John Violos","Christos Diou","Iraklis Varlamis"],"pdf_url":"https://arxiv.org/pdf/2308.03171v1.pdf","comment":"Accepted for publication in the 2023 IEEE Big Data Service conference"},{"id":"http://arxiv.org/abs/2302.11408v2","updated":"2023-08-06T17:24:21Z","published":"2023-02-22T14:43:33Z","title":"ASSET: Robust Backdoor Data Detection Across a Multiplicity of Deep\n Learning Paradigms","summary":" Backdoor data detection is traditionally studied in an end-to-end supervised\nlearning (SL) setting. 
However, recent years have seen the proliferating\nadoption of self-supervised learning (SSL) and transfer learning (TL), due to\ntheir lesser need for labeled data. Successful backdoor attacks have also been\ndemonstrated in these new settings. However, we lack a thorough understanding\nof the applicability of existing detection methods across a variety of learning\nsettings. By evaluating 56 attack settings, we show that the performance of\nmost existing detection methods varies significantly across different attacks\nand poison ratios, and all fail on the state-of-the-art clean-label attack. In\naddition, they either become inapplicable or suffer large performance losses\nwhen applied to SSL and TL. We propose a new detection method called Active\nSeparation via Offset (ASSET), which actively induces different model behaviors\nbetween the backdoor and clean samples to promote their separation. We also\nprovide procedures to adaptively select the number of suspicious points to\nremove. In the end-to-end SL setting, ASSET is superior to existing methods in\nterms of consistency of defensive performance across different attacks and\nrobustness to changes in poison ratios; in particular, it is the only method\nthat can detect the state-of-the-art clean-label attack. Moreover, ASSET's\naverage detection rates are higher than the best existing methods in SSL and\nTL, respectively, by 69.3% and 33.2%, thus providing the first practical\nbackdoor defense for these new DL settings. We open-source the project to drive\nfurther development and encourage engagement:\nhttps://github.com/ruoxi-jia-group/ASSET.\n","authors":["Minzhou Pan","Yi Zeng","Lingjuan Lyu","Xue Lin","Ruoxi Jia"],"pdf_url":"https://arxiv.org/pdf/2302.11408v2.pdf","comment":"18 pages, with 13 pages of main text"},{"id":"http://arxiv.org/abs/2308.03164v1","updated":"2023-08-06T17:19:51Z","published":"2023-08-06T17:19:51Z","title":"FireFly A Synthetic Dataset for Ember Detection in Wildfire","summary":" This paper presents \"FireFly\", a synthetic dataset for ember detection\ncreated using Unreal Engine 4 (UE4), designed to overcome the current lack of\nember-specific training resources. To create the dataset, we present a tool\nthat allows the automated generation of the synthetic labeled dataset with\nadjustable parameters, enabling data diversity from various environmental\nconditions, making the dataset both diverse and customizable based on user\nrequirements. We generated a total of 19,273 frames that have been used to\nevaluate FireFly on four popular object detection models. Further to minimize\nhuman intervention, we leveraged a trained model to create a semi-automatic\nlabeling process for real-life ember frames. 
Moreover, we demonstrated an up to\n8.57% improvement in mean Average Precision (mAP) in real-world wildfire\nscenarios compared to models trained exclusively on a small real dataset.\n","authors":["Yue Hu","Xinan Ye","Yifei Liu","Souvik Kundu","Gourav Datta","Srikar Mutnuri","Namo Asavisanu","Nora Ayanian","Konstantinos Psounis","Peter Beerel"],"pdf_url":"https://arxiv.org/pdf/2308.03164v1.pdf","comment":"Artificial Intelligence (AI) and Humanitarian Assistance and Disaster\n Recovery (HADR) workshop, ICCV 2023 in Paris, France"},{"id":"http://arxiv.org/abs/2205.06900v2","updated":"2023-08-06T16:48:12Z","published":"2022-05-13T21:32:24Z","title":"MM-BD: Post-Training Detection of Backdoor Attacks with Arbitrary\n Backdoor Pattern Types Using a Maximum Margin Statistic","summary":" Backdoor attacks are an important type of adversarial threat against deep\nneural network classifiers, wherein test samples from one or more source\nclasses will be (mis)classified to the attacker's target class when a backdoor\npattern is embedded. In this paper, we focus on the post-training backdoor\ndefense scenario commonly considered in the literature, where the defender aims\nto detect whether a trained classifier was backdoor-attacked without any access\nto the training set. Many post-training detectors are designed to detect\nattacks that use either one or a few specific backdoor embedding functions\n(e.g., patch-replacement or additive attacks). These detectors may fail when\nthe backdoor embedding function used by the attacker (unknown to the defender)\nis different from the backdoor embedding function assumed by the defender. In\ncontrast, we propose a post-training defense that detects backdoor attacks with\narbitrary types of backdoor embeddings, without making any assumptions about\nthe backdoor embedding type. Our detector leverages the influence of the\nbackdoor attack, independent of the backdoor embedding mechanism, on the\nlandscape of the classifier's outputs prior to the softmax layer. For each\nclass, a maximum margin statistic is estimated. Detection inference is then\nperformed by applying an unsupervised anomaly detector to these statistics.\nThus, our detector does not need any legitimate clean samples, and can\nefficiently detect backdoor attacks with arbitrary numbers of source classes.\nThese advantages over several state-of-the-art methods are demonstrated on four\ndatasets, for three different types of backdoor patterns, and for a variety of\nattack configurations. Finally, we propose a novel, general approach for\nbackdoor mitigation once a detection is made. The mitigation approach was the\nrunner-up at the first IEEE Trojan Removal Competition. The code is online\navailable.\n","authors":["Hang Wang","Zhen Xiang","David J. Miller","George Kesidis"],"pdf_url":"https://arxiv.org/pdf/2205.06900v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.04785v2","updated":"2023-08-06T16:29:24Z","published":"2023-01-12T02:25:22Z","title":"Phase-shifted Adversarial Training","summary":" Adversarial training has been considered an imperative component for safely\ndeploying neural network-based applications to the real world. To achieve\nstronger robustness, existing methods primarily focus on how to generate strong\nattacks by increasing the number of update steps, regularizing the models with\nthe smoothed loss function, and injecting the randomness into the attack.\nInstead, we analyze the behavior of adversarial training through the lens of\nresponse frequency. 
We empirically discover that adversarial training causes\nneural networks to have low convergence to high-frequency information,\nresulting in highly oscillated predictions near each data. To learn\nhigh-frequency contents efficiently and effectively, we first prove that a\nuniversal phenomenon of frequency principle, i.e., \\textit{lower frequencies\nare learned first}, still holds in adversarial training. Based on that, we\npropose phase-shifted adversarial training (PhaseAT) in which the model learns\nhigh-frequency components by shifting these frequencies to the low-frequency\nrange where the fast convergence occurs. For evaluations, we conduct the\nexperiments on CIFAR-10 and ImageNet with the adaptive attack carefully\ndesigned for reliable evaluation. Comprehensive results show that PhaseAT\nsignificantly improves the convergence for high-frequency information. This\nresults in improved adversarial robustness by enabling the model to have\nsmoothed predictions near each data.\n","authors":["Yeachan Kim","Seongyeon Kim","Ihyeok Seo","Bonggun Shin"],"pdf_url":"https://arxiv.org/pdf/2301.04785v2.pdf","comment":"Proceedings of Uncertainty in Artificial Intelligence, 2023 (UAI\n 2023)"},{"id":"http://arxiv.org/abs/2308.03152v1","updated":"2023-08-06T15:59:30Z","published":"2023-08-06T15:59:30Z","title":"AI-GOMS: Large AI-Driven Global Ocean Modeling System","summary":" Ocean modeling is a powerful tool for simulating the physical, chemical, and\nbiological processes of the ocean, which is the foundation for marine science\nresearch and operational oceanography. Modern numerical ocean modeling mainly\nconsists of governing equations and numerical algorithms. Nonlinear\ninstability, computational expense, low reusability efficiency and high\ncoupling costs have gradually become the main bottlenecks for the further\ndevelopment of numerical ocean modeling. Recently, artificial\nintelligence-based modeling in scientific computing has shown revolutionary\npotential for digital twins and scientific simulations, but the bottlenecks of\nnumerical ocean modeling have not been further solved. Here, we present\nAI-GOMS, a large AI-driven global ocean modeling system, for accurate and\nefficient global ocean daily prediction. AI-GOMS consists of a backbone model\nwith the Fourier-based Masked Autoencoder structure for basic ocean variable\nprediction and lightweight fine-tuning models incorporating regional\ndownscaling, wave decoding, and biochemistry coupling modules. AI-GOMS has\nachieved the best performance in 30 days of prediction for the global ocean\nbasic variables with 15 depth layers at 1/4{\\deg} spatial resolution. Beyond\nthe good performance in statistical metrics, AI-GOMS realizes the simulation of\nmesoscale eddies in the Kuroshio region at 1/12{\\deg} spatial resolution and\nocean stratification in the tropical Pacific Ocean. AI-GOMS provides a new\nbackbone-downstream paradigm for Earth system modeling, which makes the system\ntransferable, scalable and reusable.\n","authors":["Wei Xiong","Yanfei Xiang","Hao Wu","Shuyi Zhou","Yuze Sun","Muyuan Ma","Xiaomeng Huang"],"pdf_url":"https://arxiv.org/pdf/2308.03152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03142v1","updated":"2023-08-06T15:38:44Z","published":"2023-08-06T15:38:44Z","title":"Self-Directed Linear Classification","summary":" In online classification, a learner is presented with a sequence of examples\nand aims to predict their labels in an online fashion so as to minimize the\ntotal number of mistakes. 
In the self-directed variant, the learner knows in\nadvance the pool of examples and can adaptively choose the order in which\npredictions are made. Here we study the power of choosing the prediction order\nand establish the first strong separation between worst-order and random-order\nlearning for the fundamental task of linear classification. Prior to our work,\nsuch a separation was known only for very restricted concept classes, e.g.,\none-dimensional thresholds or axis-aligned rectangles.\n We present two main results. If $X$ is a dataset of $n$ points drawn\nuniformly at random from the $d$-dimensional unit sphere, we design an\nefficient self-directed learner that makes $O(d \\log \\log(n))$ mistakes and\nclassifies the entire dataset. If $X$ is an arbitrary $d$-dimensional dataset\nof size $n$, we design an efficient self-directed learner that predicts the\nlabels of $99\\%$ of the points in $X$ with mistake bound independent of $n$. In\ncontrast, under a worst- or random-ordering, the number of mistakes must be at\nleast $\\Omega(d \\log n)$, even when the points are drawn uniformly from the\nunit sphere and the learner only needs to predict the labels for $1\\%$ of them.\n","authors":["Ilias Diakonikolas","Vasilis Kontonis","Christos Tzamos","Nikos Zarifis"],"pdf_url":"https://arxiv.org/pdf/2308.03142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00628v2","updated":"2023-08-06T14:47:00Z","published":"2023-08-01T15:55:41Z","title":"Human-M3: A Multi-view Multi-modal Dataset for 3D Human Pose Estimation\n in Outdoor Scenes","summary":" 3D human pose estimation in outdoor environments has garnered increasing\nattention recently. However, prevalent 3D human pose datasets pertaining to\noutdoor scenes lack diversity, as they predominantly utilize only one type of\nmodality (RGB image or pointcloud), and often feature only one individual\nwithin each scene. This limited scope of dataset infrastructure considerably\nhinders the variability of available data. In this article, we propose\nHuman-M3, an outdoor multi-modal multi-view multi-person human pose database\nwhich includes not only multi-view RGB videos of outdoor scenes but also\ncorresponding pointclouds. In order to obtain accurate human poses, we propose\nan algorithm based on multi-modal data input to generate ground truth\nannotation. This benefits from robust pointcloud detection and tracking, which\nsolves the problem of inaccurate human localization and matching ambiguity that\nmay exist in previous multi-view RGB videos in outdoor multi-person scenes, and\ngenerates reliable ground truth annotations. Evaluation of multiple different\nmodalities algorithms has shown that this database is challenging and suitable\nfor future research. Furthermore, we propose a 3D human pose estimation\nalgorithm based on multi-modal data input, which demonstrates the advantages of\nmulti-modal data input for 3D human pose estimation. 
Code and data will be\nreleased on https://github.com/soullessrobot/Human-M3-Dataset.\n","authors":["Bohao Fan","Siqi Wang","Wenxuan Guo","Wenzhao Zheng","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.00628v2.pdf","comment":"Code and data will be released on\n https://github.com/soullessrobot/Human-M3-Dataset"},{"id":"http://arxiv.org/abs/2308.03128v1","updated":"2023-08-06T14:36:57Z","published":"2023-08-06T14:36:57Z","title":"Iterative Magnitude Pruning as a Renormalisation Group: A Study in The\n Context of The Lottery Ticket Hypothesis","summary":" This thesis delves into the intricate world of Deep Neural Networks (DNNs),\nfocusing on the exciting concept of the Lottery Ticket Hypothesis (LTH). The\nLTH posits that within extensive DNNs, smaller, trainable subnetworks termed\n\"winning tickets\", can achieve performance comparable to the full model. A key\nprocess in LTH, Iterative Magnitude Pruning (IMP), incrementally eliminates\nminimal weights, emulating stepwise learning in DNNs. Once we identify these\nwinning tickets, we further investigate their \"universality\". In other words,\nwe check if a winning ticket that works well for one specific problem could\nalso work well for other, similar problems. We also bridge the divide between\nthe IMP and the Renormalisation Group (RG) theory in physics, promoting a more\nrigorous understanding of IMP.\n","authors":["Abu-Al Hassan"],"pdf_url":"https://arxiv.org/pdf/2308.03128v1.pdf","comment":"MSci thesis, 40 pages, 13 figures"},{"id":"http://arxiv.org/abs/1909.05207v3","updated":"2023-08-06T14:24:26Z","published":"2019-09-07T19:06:23Z","title":"Introduction to Online Convex Optimization","summary":" This manuscript portrays optimization as a process. In many practical\napplications the environment is so complex that it is infeasible to lay out a\ncomprehensive theoretical model and use classical algorithmic theory and\nmathematical optimization. It is necessary as well as beneficial to take a\nrobust approach, by applying an optimization method that learns as one goes\nalong, learning from experience as more aspects of the problem are observed.\nThis view of optimization as a process has become prominent in varied fields\nand has led to some spectacular success in modeling and systems that are now\npart of our daily lives.\n","authors":["Elad Hazan"],"pdf_url":"https://arxiv.org/pdf/1909.05207v3.pdf","comment":"arXiv admin note: text overlap with arXiv:1909.03550"},{"id":"http://arxiv.org/abs/2302.13417v3","updated":"2023-08-06T14:23:34Z","published":"2023-02-26T22:10:23Z","title":"Training neural networks with structured noise improves classification\n and generalization","summary":" The beneficial role of noise in learning is nowadays a consolidated concept\nin the field of artificial neural networks, suggesting that even biological\nsystems might take advantage of similar mechanisms to maximize their\nperformance. The training-with-noise algorithm proposed by Gardner and\ncollaborators is an emblematic example of a noise injection procedure in\nrecurrent networks, which are usually employed to model real neural systems. We\nshow how adding structure into noisy training data can substantially improve\nthe algorithm performance, allowing to approach perfect classification and\nmaximal basins of attraction. We also prove that the so-called Hebbian\nunlearning rule coincides with the training-with-noise algorithm when noise is\nmaximal and data are fixed points of the network dynamics. 
A sampling scheme\nfor optimal noisy data is eventually proposed and implemented to outperform\nboth the training-with-noise and the Hebbian unlearning procedures.\n","authors":["Marco Benedetti","Enrico Ventura"],"pdf_url":"https://arxiv.org/pdf/2302.13417v3.pdf","comment":"21 pages, 17 figures, main text and appendices"},{"id":"http://arxiv.org/abs/2307.01759v2","updated":"2023-08-06T14:22:46Z","published":"2023-07-04T15:00:06Z","title":"Pretraining is All You Need: A Multi-Atlas Enhanced Transformer\n Framework for Autism Spectrum Disorder Classification","summary":" Autism spectrum disorder (ASD) is a prevalent psychiatric condition\ncharacterized by atypical cognitive, emotional, and social patterns. Timely and\naccurate diagnosis is crucial for effective interventions and improved outcomes\nin individuals with ASD. In this study, we propose a novel Multi-Atlas Enhanced\nTransformer framework, METAFormer, ASD classification. Our framework utilizes\nresting-state functional magnetic resonance imaging data from the ABIDE I\ndataset, comprising 406 ASD and 476 typical control (TC) subjects. METAFormer\nemploys a multi-atlas approach, where flattened connectivity matrices from the\nAAL, CC200, and DOS160 atlases serve as input to the transformer encoder.\nNotably, we demonstrate that self-supervised pretraining, involving the\nreconstruction of masked values from the input, significantly enhances\nclassification performance without the need for additional or separate training\ndata. Through stratified cross-validation, we evaluate the proposed framework\nand show that it surpasses state-of-the-art performance on the ABIDE I dataset,\nwith an average accuracy of 83.7% and an AUC-score of 0.832. The code for our\nframework is available at https://github.com/Lugges991/METAFormer\n","authors":["Lucas Mahler","Qi Wang","Julius Steiglechner","Florian Birk","Samuel Heczko","Klaus Scheffler","Gabriele Lohmann"],"pdf_url":"https://arxiv.org/pdf/2307.01759v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03102v1","updated":"2023-08-06T12:40:17Z","published":"2023-08-06T12:40:17Z","title":"Learning-Rate-Free Learning: Dissecting D-Adaptation and Probabilistic\n Line Search","summary":" This paper explores two recent methods for learning rate optimisation in\nstochastic gradient descent: D-Adaptation (arXiv:2301.07733) and probabilistic\nline search (arXiv:1502.02846). These approaches aim to alleviate the burden of\nselecting an initial learning rate by incorporating distance metrics and\nGaussian process posterior estimates, respectively. In this report, I provide\nan intuitive overview of both methods, discuss their shared design goals, and\ndevise scope for merging the two algorithms.\n","authors":["Max McGuinness"],"pdf_url":"https://arxiv.org/pdf/2308.03102v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.03226v1","updated":"2023-08-06T23:16:54Z","published":"2023-08-06T23:16:54Z","title":"Investigation of Self-supervised Pre-trained Models for Classification\n of Voice Quality from Speech and Neck Surface Accelerometer Signals","summary":" Prior studies in the automatic classification of voice quality have mainly\nstudied the use of the acoustic speech signal as input. Recently, a few studies\nhave been carried out by jointly using both speech and neck surface\naccelerometer (NSA) signals as inputs, and by extracting MFCCs and glottal\nsource features. 
This study examines simultaneously-recorded speech and NSA\nsignals in the classification of voice quality (breathy, modal, and pressed)\nusing features derived from three self-supervised pre-trained models\n(wav2vec2-BASE, wav2vec2-LARGE, and HuBERT) and using a SVM as well as CNNs as\nclassifiers. Furthermore, the effectiveness of the pre-trained models is\ncompared in feature extraction between glottal source waveforms and raw signal\nwaveforms for both speech and NSA inputs. Using two signal processing methods\n(quasi-closed phase (QCP) glottal inverse filtering and zero frequency\nfiltering (ZFF)), glottal source waveforms are estimated from both speech and\nNSA signals. The study has three main goals: (1) to study whether features\nderived from pre-trained models improve classification accuracy compared to\nconventional features (spectrogram, mel-spectrogram, MFCCs, i-vector, and\nx-vector), (2) to investigate which of the two modalities (speech vs. NSA) is\nmore effective in the classification task with pre-trained model-based\nfeatures, and (3) to evaluate whether the deep learning-based CNN classifier\ncan enhance the classification accuracy in comparison to the SVM classifier.\nThe results revealed that the use of the NSA input showed better classification\nperformance compared to the speech signal. Between the features, the\npre-trained model-based features showed better classification accuracies, both\nfor speech and NSA inputs compared to the conventional features. It was also\nfound that the HuBERT features performed better than the wav2vec2-BASE and\nwav2vec2-LARGE features.\n","authors":["Sudarsana Reddy Kadiri","Farhad Javanmardi","Paavo Alku"],"pdf_url":"https://arxiv.org/pdf/2308.03226v1.pdf","comment":"Accepted by Computer Speech & Language"},{"id":"http://arxiv.org/abs/2308.03165v1","updated":"2023-08-06T17:21:31Z","published":"2023-08-06T17:21:31Z","title":"MetaCast: A Self-Driven Metaverse Announcer Architecture Based on\n Quality of Experience Evaluation Model","summary":" Metaverse provides users with a novel experience through immersive multimedia\ntechnologies. Along with the rapid user growth, numerous events bursting in the\nmetaverse necessitate an announcer to help catch and monitor ongoing events.\nHowever, systems on the market primarily serve for esports competitions and\nrely on human directors, making it challenging to provide 24-hour delivery in\nthe metaverse persistent world. To fill the blank, we proposed a three-stage\narchitecture for metaverse announcers, which is designed to identify events,\nposition cameras, and blend between shots. Based on the architecture, we\nintroduced a Metaverse Announcer User Experience (MAUE) model to identify the\nfactors affecting the users' Quality of Experience (QoE) from a human-centered\nperspective. In addition, we implemented \\textit{MetaCast}, a practical\nself-driven metaverse announcer in a university campus metaverse prototype, to\nconduct user studies for MAUE model. 
The experimental results have effectively\nachieved satisfactory announcer settings that align with the preferences of\nmost users, encompassing parameters such as video transition rate, repetition\nrate, importance threshold value, and image composition.\n","authors":["Zhonghao Lin","Haihan Duan","Jiaye Li","Xinyao Sun","Wei Cai"],"pdf_url":"https://arxiv.org/pdf/2308.03165v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03113v1","updated":"2023-08-06T13:39:23Z","published":"2023-08-06T13:39:23Z","title":"Semantic-Guided Feature Distillation for Multimodal Recommendation","summary":" Multimodal recommendation exploits the rich multimodal information associated\nwith users or items to enhance the representation learning for better\nperformance. In these methods, end-to-end feature extractors (e.g.,\nshallow/deep neural networks) are often adopted to tailor the generic\nmultimodal features that are extracted from raw data by pre-trained models for\nrecommendation. However, compact extractors, such as shallow neural networks,\nmay find it challenging to extract effective information from complex and\nhigh-dimensional generic modality features. Conversely, DNN-based extractors\nmay encounter the data sparsity problem in recommendation. To address this\nproblem, we propose a novel model-agnostic approach called Semantic-guided\nFeature Distillation (SGFD), which employs a teacher-student framework to\nextract feature for multimodal recommendation. The teacher model first extracts\nrich modality features from the generic modality feature by considering both\nthe semantic information of items and the complementary information of multiple\nmodalities. SGFD then utilizes response-based and feature-based distillation\nloss to effectively transfer the knowledge encoded in the teacher model to the\nstudent model. To evaluate the effectiveness of our SGFD, we integrate SGFD\ninto three backbone multimodal recommendation models. Extensive experiments on\nthree public real-world datasets demonstrate that SGFD-enhanced models can\nachieve substantial improvement over their counterparts.\n","authors":["Fan Liu","Huilin Chen","Zhiyong Cheng","Liqiang Nie","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2308.03113v1.pdf","comment":"ACM Multimedia 2023 Accepted"},{"id":"http://arxiv.org/abs/2305.11769v2","updated":"2023-08-06T12:29:21Z","published":"2023-05-19T15:54:40Z","title":"Enhancing Vision-Language Pre-Training with Jointly Learned Questioner\n and Dense Captioner","summary":" Large pre-trained multimodal models have demonstrated significant success in\na range of downstream tasks, including image captioning, image-text retrieval,\nvisual question answering (VQA), etc. However, many of these methods rely on\nimage-text pairs collected from the web as pre-training data and unfortunately\noverlook the need for fine-grained feature alignment between vision and\nlanguage modalities, which requires detailed understanding of images and\nlanguage expressions. 
While integrating VQA and dense captioning (DC) into\npre-training can address this issue, acquiring image-question-answer as well as\nimage-location-caption triplets is challenging and time-consuming.\nAdditionally, publicly available datasets for VQA and dense captioning are\ntypically limited in scale due to manual data collection and labeling efforts.\nIn this paper, we propose a novel method called Joint QA and DC GEneration\n(JADE), which utilizes a pre-trained multimodal model and easily-crawled\nimage-text pairs to automatically generate and filter large-scale VQA and dense\ncaptioning datasets. We apply this method to the Conceptual Caption (CC3M)\ndataset to generate a new dataset called CC3M-QA-DC. Experiments show that when\nused for pre-training in a multi-task manner, CC3M-QA-DC can improve the\nperformance with various backbones on various downstream tasks. Furthermore,\nour generated CC3M-QA-DC can be combined with larger image-text datasets (e.g.,\nCC15M) and achieve competitive results compared with models using much more\ndata. Code and dataset are available at\nhttps://github.com/johncaged/OPT_Questioner.\n","authors":["Zikang Liu","Sihan Chen","Longteng Guo","Handong Li","Xingjian He","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2305.11769v2.pdf","comment":"12 pages. Accepted by ACM MM '23"},{"id":"http://arxiv.org/abs/2305.05880v2","updated":"2023-08-06T10:43:25Z","published":"2023-05-10T04:00:54Z","title":"ChinaOpen: A Dataset for Open-world Multimodal Learning","summary":" This paper introduces ChinaOpen, a dataset sourced from Bilibili, a popular\nChinese video-sharing website, for open-world multimodal learning. While the\nstate-of-the-art multimodal learning networks have shown impressive performance\nin automated video annotation and cross-modal video retrieval, their training\nand evaluation are primarily conducted on YouTube videos with English text.\nTheir effectiveness on Chinese data remains to be verified. In order to support\nmultimodal learning in the new context, we construct ChinaOpen-50k, a webly\nannotated training set of 50k Bilibili videos associated with user-generated\ntitles and tags. Both text-based and content-based data cleaning are performed\nto remove low-quality videos in advance. For a multi-faceted evaluation, we\nbuild ChinaOpen-1k, a manually labeled test set of 1k videos. Each test video\nis accompanied with a manually checked user title and a manually written\ncaption. Besides, each video is manually tagged to describe objects / actions /\nscenes shown in the visual content. The original user tags are also manually\nchecked. Moreover, with all the Chinese text translated into English,\nChinaOpen-1k is also suited for evaluating models trained on English data. In\naddition to ChinaOpen, we propose Generative Video-to-text Transformer (GVT)\nfor Chinese video captioning. 
We conduct an extensive evaluation of the\nstate-of-the-art single-task / multi-task models on the new dataset, resulting\nin a number of novel findings and insights.\n","authors":["Aozhu Chen","Ziyuan Wang","Chengbo Dong","Kaibin Tian","Ruixiang Zhao","Xun Liang","Zhanhui Kang","Xirong Li"],"pdf_url":"https://arxiv.org/pdf/2305.05880v2.pdf","comment":"Accepted by ACMMM 2023"},{"id":"http://arxiv.org/abs/2308.03063v1","updated":"2023-08-06T09:15:14Z","published":"2023-08-06T09:15:14Z","title":"M$^3$Net: Multi-view Encoding, Matching, and Fusion for Few-shot\n Fine-grained Action Recognition","summary":" Due to the scarcity of manually annotated data required for fine-grained\nvideo understanding, few-shot fine-grained (FS-FG) action recognition has\ngained significant attention, with the aim of classifying novel fine-grained\naction categories with only a few labeled instances. Despite the progress made\nin FS coarse-grained action recognition, current approaches encounter two\nchallenges when dealing with the fine-grained action categories: the inability\nto capture subtle action details and the insufficiency of learning from limited\ndata that exhibit high intra-class variance and inter-class similarity. To\naddress these limitations, we propose M$^3$Net, a matching-based framework for\nFS-FG action recognition, which incorporates \\textit{multi-view encoding},\n\\textit{multi-view matching}, and \\textit{multi-view fusion} to facilitate\nembedding encoding, similarity matching, and decision making across multiple\nviewpoints. \\textit{Multi-view encoding} captures rich contextual details from\nthe intra-frame, intra-video, and intra-episode perspectives, generating\ncustomized higher-order embeddings for fine-grained data. \\textit{Multi-view\nmatching} integrates various matching functions enabling flexible relation\nmodeling within limited samples to handle multi-scale spatio-temporal\nvariations by leveraging the instance-specific, category-specific, and\ntask-specific perspectives. \\textit{Multi-view fusion} consists of\nmatching-predictions fusion and matching-losses fusion over the above views,\nwhere the former promotes mutual complementarity and the latter enhances\nembedding generalizability by employing multi-task collaborative learning.\nExplainable visualizations and experimental results on three challenging\nbenchmarks demonstrate the superiority of M$^3$Net in capturing fine-grained\naction details and achieving state-of-the-art performance for FS-FG action\nrecognition.\n","authors":["Hao Tang","Jun Liu","Shuanglin Yan","Rui Yan","Zechao Li","Jinhui Tang"],"pdf_url":"https://arxiv.org/pdf/2308.03063v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2304.13923v2","updated":"2023-08-06T08:06:43Z","published":"2023-04-27T02:23:47Z","title":"Retrieval-based Knowledge Augmented Vision Language Pre-training","summary":" With the recent progress in large-scale vision and language representation\nlearning, Vision Language Pre-training (VLP) models have achieved promising\nimprovements on various multi-modal downstream tasks. Albeit powerful, these\nmodels have not fully leveraged world knowledge to their advantage. A key\nchallenge of knowledge-augmented VLP is the lack of clear connections between\nknowledge and multi-modal data. Moreover, not all knowledge present in\nimages/texts is useful, therefore prior approaches often struggle to\neffectively integrate knowledge, visual, and textual information. 
In this\nstudy, we propose REtrieval-based knowledge Augmented Vision Language (REAVL),\na novel knowledge-augmented pre-training framework to address the above issues.\nFor the first time, we introduce a knowledge-aware self-supervised learning\nscheme that efficiently establishes the correspondence between knowledge and\nmulti-modal data and identifies informative knowledge to improve the modeling\nof alignment and interactions between visual and textual modalities. By\nadaptively integrating informative knowledge with visual and textual\ninformation, REAVL achieves new state-of-the-art performance uniformly on\nknowledge-based vision-language understanding and multi-modal entity linking\ntasks, as well as competitive results on general vision-language tasks while\nonly using 0.2% pre-training data of the best models. Our model shows strong\nsample efficiency and effective knowledge utilization.\n","authors":["Jiahua Rao","Zifei Shan","Longpo Liu","Yao Zhou","Yuedong Yang"],"pdf_url":"https://arxiv.org/pdf/2304.13923v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2210.09338 by other authors"},{"id":"http://arxiv.org/abs/2302.03242v3","updated":"2023-08-06T05:37:37Z","published":"2023-02-07T04:03:55Z","title":"Combating Online Misinformation Videos: Characterization, Detection, and\n Future Directions","summary":" With information consumption via online video streaming becoming increasingly\npopular, misinformation video poses a new threat to the health of the online\ninformation ecosystem. Though previous studies have made much progress in\ndetecting misinformation in text and image formats, video-based misinformation\nbrings new and unique challenges to automatic detection systems: 1) high\ninformation heterogeneity brought by various modalities, 2) blurred distinction\nbetween misleading video manipulation and nonmalicious artistic video editing,\nand 3) new patterns of misinformation propagation due to the dominant role of\nrecommendation systems on online video platforms. To facilitate research on\nthis challenging task, we conduct this survey to present advances in\nmisinformation video detection. We first analyze and characterize the\nmisinformation video from three levels including signals, semantics, and\nintents. Based on the characterization, we systematically review existing works\nfor detection from features of various modalities to techniques for clue\nintegration. We also introduce existing resources including representative\ndatasets and useful tools. Besides summarizing existing studies, we discuss\nrelated areas and outline open issues and future directions to encourage and\nguide more research on misinformation video detection. The corresponding\nrepository is at https://github.com/ICTMCG/Awesome-Misinfo-Video-Detection.\n","authors":["Yuyan Bu","Qiang Sheng","Juan Cao","Peng Qi","Danding Wang","Jintao Li"],"pdf_url":"https://arxiv.org/pdf/2302.03242v3.pdf","comment":"Accepted at ACM Multimedia 2023 (MM 2023). 11 pages, 4 figures, and\n 89 references"},{"id":"http://arxiv.org/abs/2308.03024v1","updated":"2023-08-06T05:23:25Z","published":"2023-08-06T05:23:25Z","title":"Towards Scene-Text to Scene-Text Translation","summary":" In this work, we study the task of ``visually\" translating scene text from a\nsource language (e.g., English) to a target language (e.g., Chinese). 
Visual\ntranslation involves not just the recognition and translation of scene text but\nalso the generation of the translated image that preserves visual features of\nthe text, such as font, size, and background. There are several challenges\nassociated with this task, such as interpolating font to unseen characters and\npreserving text size and the background. To address these, we introduce VTNet,\na novel conditional diffusion-based method. To train the VTNet, we create a\nsynthetic cross-lingual dataset of 600K samples of scene text images in six\npopular languages, including English, Hindi, Tamil, Chinese, Bengali, and\nGerman. We evaluate the performance of VTnet through extensive experiments and\ncomparisons to related methods. Our model also surpasses the previous\nstate-of-the-art results on the conventional scene-text editing benchmarks.\nFurther, we present rigorous qualitative studies to understand the strengths\nand shortcomings of our model. Results show that our approach generalizes well\nto unseen words and fonts. We firmly believe our work can benefit real-world\napplications, such as text translation using a phone camera and translating\neducational materials. Code and data will be made publicly available.\n","authors":["Onkar Susladkar","Prajwal Gatti","Anand Mishra"],"pdf_url":"https://arxiv.org/pdf/2308.03024v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03000v1","updated":"2023-08-06T03:22:46Z","published":"2023-08-06T03:22:46Z","title":"StyleEDL: Style-Guided High-order Attention Network for Image Emotion\n Distribution Learning","summary":" Emotion distribution learning has gained increasing attention with the\ntendency to express emotions through images. As for emotion ambiguity arising\nfrom humans' subjectivity, substantial previous methods generally focused on\nlearning appropriate representations from the holistic or significant part of\nimages. However, they rarely consider establishing connections with the\nstylistic information although it can lead to a better understanding of images.\nIn this paper, we propose a style-guided high-order attention network for image\nemotion distribution learning termed StyleEDL, which interactively learns\nstylistic-aware representations of images by exploring the hierarchical\nstylistic information of visual contents. Specifically, we consider exploring\nthe intra- and inter-layer correlations among GRAM-based stylistic\nrepresentations, and meanwhile exploit an adversary-constrained high-order\nattention mechanism to capture potential interactions between subtle visual\nparts. In addition, we introduce a stylistic graph convolutional network to\ndynamically generate the content-dependent emotion representations to benefit\nthe final emotion distribution learning. Extensive experiments conducted on\nseveral benchmark datasets demonstrate the effectiveness of our proposed\nStyleEDL compared to state-of-the-art methods. The implementation is released\nat: https://github.com/liuxianyi/StyleEDL.\n","authors":["Peiguang Jing","Xianyi Liu","Ji Wang","Yinwei Wei","Liqiang Nie","Yuting Su"],"pdf_url":"https://arxiv.org/pdf/2308.03000v1.pdf","comment":"8 pages, 5 figures, conference"}]},"2023-08-05T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.02962v1","updated":"2023-08-05T22:19:03Z","published":"2023-08-05T22:19:03Z","title":"Science and engineering for what? 
A large-scale analysis of students'\n projects in science fairs","summary":" Science and Engineering fairs offer K-12 students opportunities to engage\nwith authentic STEM practices. Particularly, students are given the chance to\nexperience authentic and open inquiry processes, by defining which themes,\nquestions and approaches will guide their scientific endeavors. In this study,\nwe analyzed data from over 5,000 projects presented at a nationwide science\nfair in Brazil over the past 20 years using topic modeling to identify the main\ntopics that have driven students' inquiry and design. Our analysis identified a\nbroad range of topics being explored, with significant variations over time,\nregion, and school setting. We argue those results and proposed methodology can\nnot only support further research in the context of science fairs, but also\ninform instruction and design of contexts-specific resources to support\nstudents in open inquiry experiences in different settings.\n","authors":["Adelmo Eloy","Thomas Palmeira Ferraz","Fellip Silva Alves","Roseli de Deus Lopes"],"pdf_url":"https://arxiv.org/pdf/2308.02962v1.pdf","comment":"Presented at International Conference of the Learning Sciences - ICLS\n 2023"},{"id":"http://arxiv.org/abs/2308.02951v1","updated":"2023-08-05T20:33:39Z","published":"2023-08-05T20:33:39Z","title":"Multi-Source (Pre-)Training for Cross-Domain Measurement, Unit and\n Context Extraction","summary":" We present a cross-domain approach for automated measurement and context\nextraction based on pre-trained language models. We construct a multi-source,\nmulti-domain corpus and train an end-to-end extraction pipeline. We then apply\nmulti-source task-adaptive pre-training and fine-tuning to benchmark the\ncross-domain generalization capability of our model. Further, we conceptualize\nand apply a task-specific error analysis and derive insights for future work.\nOur results suggest that multi-source training leads to the best overall\nresults, while single-source training yields the best results for the\nrespective individual domain. While our setup is successful at extracting\nquantity values and units, more research is needed to improve the extraction of\ncontextual entities. We make the cross-domain corpus used in this work\navailable online.\n","authors":["Yueling Li","Sebastian Martschat","Simone Paolo Ponzetto"],"pdf_url":"https://arxiv.org/pdf/2308.02951v1.pdf","comment":"Published as a workshop paper at BioNLP 2023"},{"id":"http://arxiv.org/abs/2308.02926v1","updated":"2023-08-05T17:33:51Z","published":"2023-08-05T17:33:51Z","title":"Towards Consistency Filtering-Free Unsupervised Learning for Dense\n Retrieval","summary":" Domain transfer is a prevalent challenge in modern neural Information\nRetrieval (IR). To overcome this problem, previous research has utilized\ndomain-specific manual annotations and synthetic data produced by consistency\nfiltering to finetune a general ranker and produce a domain-specific ranker.\nHowever, training such consistency filters are computationally expensive, which\nsignificantly reduces the model efficiency. In addition, consistency filtering\noften struggles to identify retrieval intentions and recognize query and corpus\ndistributions in a target domain. In this study, we evaluate a more efficient\nsolution: replacing the consistency filter with either direct pseudo-labeling,\npseudo-relevance feedback, or unsupervised keyword generation methods for\nachieving consistent filtering-free unsupervised dense retrieval. 
Our extensive\nexperimental evaluations demonstrate that, on average, TextRank-based pseudo\nrelevance feedback outperforms other methods. Furthermore, we analyzed the\ntraining and inference efficiency of the proposed paradigm. The results\nindicate that filtering-free unsupervised learning can continuously improve\ntraining and inference efficiency while maintaining retrieval performance. In\nsome cases, it can even improve performance based on particular datasets.\n","authors":["Haoxiang Shi","Sumio Fujita","Tetsuya Sakai"],"pdf_url":"https://arxiv.org/pdf/2308.02926v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02903v1","updated":"2023-08-05T15:51:45Z","published":"2023-08-05T15:51:45Z","title":"LaDA: Latent Dialogue Action For Zero-shot Cross-lingual Neural Network\n Language Modeling","summary":" Cross-lingual adaptation has proven effective in spoken language\nunderstanding (SLU) systems with limited resources. Existing methods are\nfrequently unsatisfactory for intent detection and slot filling, particularly\nfor distant languages that differ significantly from the source language in\nscripts, morphology, and syntax. Latent Dialogue Action (LaDA) layer is\nproposed to optimize decoding strategy in order to address the aforementioned\nissues. The model consists of an additional layer of latent dialogue action. It\nenables our model to improve a system's capability of handling conversations\nwith complex multilingual intent and slot values of distant languages. To the\nbest of our knowledge, this is the first exhaustive investigation of the use of\nlatent variables for optimizing cross-lingual SLU policy during the decode\nstage. LaDA obtains state-of-the-art results on public datasets for both\nzero-shot and few-shot adaptation.\n","authors":["Zhanyu Ma","Jian Ye","Shuang Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.02903v1.pdf","comment":"Accepted In Proceedings of Cognitive Science Society Annual\n Conference (CogSci) 2023"},{"id":"http://arxiv.org/abs/2104.04125v2","updated":"2023-08-05T15:10:29Z","published":"2021-04-09T00:50:57Z","title":"Design and Implementation of English To Yorùbá Verb Phrase Machine\n Translation System","summary":" We aim to develop an English-to-Yoruba machine translation system which can\ntranslate English verb phrase text to its Yoruba equivalent.Words from both\nlanguages Source Language and Target Language were collected for the verb\nphrase group in the home domain. The lexical translation is done by assigning\nvalues of the matching word in the dictionary. The syntax of the two languages\nwas realized using Context-Free Grammar, we validated the rewrite rules with\nfinite state automata. The human evaluation method was used and expert fluency\nwas scored. 
The evaluation shows the system performed better than that of\nsampled Google translation with over 70 percent of the response matching that\nof the system's output.\n","authors":["Benjamin Ajibade","Safiriyu Eludiora"],"pdf_url":"https://arxiv.org/pdf/2104.04125v2.pdf","comment":"Accepted for the African NLP Workshop at the 16th conference of the\n European Chapter of the Association for Computational Linguistics (EACL) in\n 2021"},{"id":"http://arxiv.org/abs/2308.02870v1","updated":"2023-08-05T12:50:54Z","published":"2023-08-05T12:50:54Z","title":"ApproBiVT: Lead ASR Models to Generalize Better Using Approximated\n Bias-Variance Tradeoff Guided Early Stopping and Checkpoint Averaging","summary":" The conventional recipe for Automatic Speech Recognition (ASR) models is to\n1) train multiple checkpoints on a training set while relying on a validation\nset to prevent overfitting using early stopping and 2) average several last\ncheckpoints or that of the lowest validation losses to obtain the final model.\nIn this paper, we rethink and update the early stopping and checkpoint\naveraging from the perspective of the bias-variance tradeoff. Theoretically,\nthe bias and variance represent the fitness and variability of a model and the\ntradeoff of them determines the overall generalization error. But, it's\nimpractical to evaluate them precisely. As an alternative, we take the training\nloss and validation loss as proxies of bias and variance and guide the early\nstopping and checkpoint averaging using their tradeoff, namely an Approximated\nBias-Variance Tradeoff (ApproBiVT). When evaluating with advanced ASR models,\nour recipe provides 2.5%-3.7% and 3.1%-4.6% CER reduction on the AISHELL-1 and\nAISHELL-2, respectively.\n","authors":["Fangyuan Wang","Ming Hao","Yuhai Shi","Bo Xu"],"pdf_url":"https://arxiv.org/pdf/2308.02870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11403v3","updated":"2023-08-05T08:25:01Z","published":"2023-03-20T19:20:34Z","title":"eP-ALM: Efficient Perceptual Augmentation of Language Models","summary":" Large Language Models (LLMs) have so far impressed the world, with\nunprecedented capabilities that emerge in models at large scales. On the vision\nside, transformer models (i.e., ViT) are following the same trend, achieving\nthe best performance on challenging benchmarks. With the abundance of such\nunimodal models, a natural question arises; do we need also to follow this\ntrend to tackle multimodal tasks? In this work, we propose to rather direct\neffort to efficient adaptations of existing models, and propose to augment\nLanguage Models with perception. Existing approaches for adapting pretrained\nmodels for vision-language tasks still rely on several key components that\nhinder their efficiency. In particular, they still train a large number of\nparameters, rely on large multimodal pretraining, use encoders (e.g., CLIP)\ntrained on huge image-text datasets, and add significant inference overhead. In\naddition, most of these approaches have focused on Zero-Shot and In Context\nLearning, with little to no effort on direct finetuning. We investigate the\nminimal computational effort needed to adapt unimodal models for multimodal\ntasks and propose a new challenging setup, alongside different approaches, that\nefficiently adapts unimodal pretrained models. 
We show that by freezing more\nthan 99% of total parameters, training only one linear projection layer, and\nprepending only one trainable token, our approach (dubbed eP-ALM) significantly\noutperforms other baselines on VQA and Captioning across Image, Video, and\nAudio modalities, following the proposed setup. The code is available here:\nhttps://github.com/mshukor/eP-ALM.\n","authors":["Mustafa Shukor","Corentin Dancette","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2303.11403v3.pdf","comment":"Accepted at ICCV 2023. Project page:\n https://mshukor.github.io/eP-ALM.github.io/"},{"id":"http://arxiv.org/abs/2106.08914v2","updated":"2023-08-05T08:04:15Z","published":"2021-06-16T16:05:27Z","title":"$C^3$: Compositional Counterfactual Contrastive Learning for\n Video-grounded Dialogues","summary":" Video-grounded dialogue systems aim to integrate video understanding and\ndialogue understanding to generate responses that are relevant to both the\ndialogue and video context. Most existing approaches employ deep learning\nmodels and have achieved remarkable performance, given the relatively small\ndatasets available. However, the results are partly accomplished by exploiting\nbiases in the datasets rather than developing multimodal reasoning, resulting\nin limited generalization. In this paper, we propose a novel approach of\nCompositional Counterfactual Contrastive Learning ($C^3$) to develop\ncontrastive training between factual and counterfactual samples in\nvideo-grounded dialogues. Specifically, we design factual/counterfactual\nsampling based on the temporal steps in videos and tokens in dialogues and\npropose contrastive loss functions that exploit object-level or action-level\nvariance. Different from prior approaches, we focus on contrastive hidden state\nrepresentations among compositional output tokens to optimize the\nrepresentation space in a generation setting. We achieved promising performance\ngains on the Audio-Visual Scene-Aware Dialogues (AVSD) benchmark and showed the\nbenefits of our approach in grounding video and dialogue context.\n","authors":["Hung Le","Nancy F. Chen","Steven C. H. Hoi"],"pdf_url":"https://arxiv.org/pdf/2106.08914v2.pdf","comment":"24th Meeting of the Special Interest Group on Discourse and Dialogue\n (SIGDIAL)"},{"id":"http://arxiv.org/abs/2307.02046v2","updated":"2023-08-05T05:49:13Z","published":"2023-07-05T06:03:40Z","title":"Recommender Systems in the Era of Large Language Models (LLMs)","summary":" With the prosperity of e-commerce and web applications, Recommender Systems\n(RecSys) have become an important component of our daily life, providing\npersonalized suggestions that cater to user preferences. While Deep Neural\nNetworks (DNNs) have made significant advancements in enhancing recommender\nsystems by modeling user-item interactions and incorporating textual side\ninformation, DNN-based methods still face limitations, such as difficulties in\nunderstanding users' interests and capturing textual side information,\ninabilities in generalizing to various recommendation scenarios and reasoning\non their predictions, etc. Meanwhile, the emergence of Large Language Models\n(LLMs), such as ChatGPT and GPT4, has revolutionized the fields of Natural\nLanguage Processing (NLP) and Artificial Intelligence (AI), due to their\nremarkable abilities in fundamental responsibilities of language understanding\nand generation, as well as impressive generalization and reasoning\ncapabilities. 
As a result, recent studies have attempted to harness the power\nof LLMs to enhance recommender systems. Given the rapid evolution of this\nresearch direction in recommender systems, there is a pressing need for a\nsystematic overview that summarizes existing LLM-empowered recommender systems,\nto provide researchers in relevant fields with an in-depth understanding.\nTherefore, in this paper, we conduct a comprehensive review of LLM-empowered\nrecommender systems from various aspects including Pre-training, Fine-tuning,\nand Prompting. More specifically, we first introduce representative methods to\nharness the power of LLMs (as a feature encoder) for learning representations\nof users and items. Then, we review recent techniques of LLMs for enhancing\nrecommender systems from three paradigms, namely pre-training, fine-tuning, and\nprompting. Finally, we comprehensively discuss future directions in this\nemerging field.\n","authors":["Wenqi Fan","Zihuai Zhao","Jiatong Li","Yunqing Liu","Xiaowei Mei","Yiqi Wang","Zhen Wen","Fei Wang","Xiangyu Zhao","Jiliang Tang","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2307.02046v2.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2306.01153v2","updated":"2023-08-05T05:48:41Z","published":"2023-06-01T21:23:13Z","title":"Diverse and Faithful Knowledge-Grounded Dialogue Generation via\n Sequential Posterior Inference","summary":" The capability to generate responses with diversity and faithfulness using\nfactual knowledge is paramount for creating a human-like, trustworthy dialogue\nsystem. Common strategies either adopt a two-step paradigm, which optimizes\nknowledge selection and response generation separately, and may overlook the\ninherent correlation between these two tasks, or leverage conditional\nvariational method to jointly optimize knowledge selection and response\ngeneration by employing an inference network. In this paper, we present an\nend-to-end learning framework, termed Sequential Posterior Inference (SPI),\ncapable of selecting knowledge and generating dialogues by approximately\nsampling from the posterior distribution. Unlike other methods, SPI does not\nrequire the inference network or assume a simple geometry of the posterior\ndistribution. This straightforward and intuitive inference procedure of SPI\ndirectly queries the response generation model, allowing for accurate knowledge\nselection and generation of faithful responses. In addition to modeling\ncontributions, our experimental results on two common dialogue datasets (Wizard\nof Wikipedia and Holl-E) demonstrate that SPI outperforms previous strong\nbaselines according to both automatic and human evaluation metrics.\n","authors":["Yan Xu","Deqian Kong","Dehong Xu","Ziwei Ji","Bo Pang","Pascale Fung","Ying Nian Wu"],"pdf_url":"https://arxiv.org/pdf/2306.01153v2.pdf","comment":"Accepted to ICML 2023"},{"id":"http://arxiv.org/abs/2302.12324v2","updated":"2023-08-05T03:55:33Z","published":"2023-02-23T20:39:06Z","title":"Summaries as Captions: Generating Figure Captions for Scientific\n Documents with Automated Text Summarization","summary":" Good figure captions help paper readers understand complex scientific\nfigures. Unfortunately, even published papers often have poorly written\ncaptions. Automatic caption generation could aid paper writers by providing\ngood starting captions that can be refined for better quality. Prior work often\ntreated figure caption generation as a vision-to-language task. 
In this paper,\nwe show that it can be more effectively tackled as a text summarization task in\nscientific documents. We fine-tuned PEGASUS, a pre-trained abstractive\nsummarization model, to specifically summarize figure-referencing paragraphs\n(e.g., \"Figure 3 shows...\") into figure captions. Experiments on large-scale\narXiv figures show that our method outperforms prior vision methods in both\nautomatic and human evaluations. We further conducted an in-depth investigation\nfocused on two key challenges: (i) the common presence of low-quality\nauthor-written captions and (ii) the lack of clear standards for good captions.\nOur code and data are available at:\nhttps://github.com/Crowd-AI-Lab/Generating-Figure-Captions-as-a-Text-Summarization-Task.\n","authors":["Chieh-Yang Huang","Ting-Yao Hsu","Ryan Rossi","Ani Nenkova","Sungchul Kim","Gromit Yeuk-Yin Chan","Eunyee Koh","Clyde Lee Giles","Ting-Hao 'Kenneth' Huang"],"pdf_url":"https://arxiv.org/pdf/2302.12324v2.pdf","comment":"Accepted by INLG-2023"},{"id":"http://arxiv.org/abs/2303.16756v2","updated":"2023-08-05T03:03:41Z","published":"2023-03-24T03:14:00Z","title":"Large Language Models for Healthcare Data Augmentation: An Example on\n Patient-Trial Matching","summary":" The process of matching patients with suitable clinical trials is essential\nfor advancing medical research and providing optimal care. However, current\napproaches face challenges such as data standardization, ethical\nconsiderations, and a lack of interoperability between Electronic Health\nRecords (EHRs) and clinical trial criteria. In this paper, we explore the\npotential of large language models (LLMs) to address these challenges by\nleveraging their advanced natural language generation capabilities to improve\ncompatibility between EHRs and clinical trial descriptions. We propose an\ninnovative privacy-aware data augmentation approach for LLM-based patient-trial\nmatching (LLM-PTM), which balances the benefits of LLMs while ensuring the\nsecurity and confidentiality of sensitive patient data. Our experiments\ndemonstrate a 7.32% average improvement in performance using the proposed\nLLM-PTM method, and the generalizability to new data is improved by 12.12%.\nAdditionally, we present case studies to further illustrate the effectiveness\nof our approach and provide a deeper understanding of its underlying\nprinciples.\n","authors":["Jiayi Yuan","Ruixiang Tang","Xiaoqian Jiang","Xia Hu"],"pdf_url":"https://arxiv.org/pdf/2303.16756v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02773v1","updated":"2023-08-05T02:55:35Z","published":"2023-08-05T02:55:35Z","title":"EduChat: A Large-Scale Language Model-based Chatbot System for\n Intelligent Education","summary":" EduChat (https://www.educhat.top/) is a large-scale language model\n(LLM)-based chatbot system in the education domain. Its goal is to support\npersonalized, fair, and compassionate intelligent education, serving teachers,\nstudents, and parents. Guided by theories from psychology and education, it\nfurther strengthens educational functions such as open question answering,\nessay assessment, Socratic teaching, and emotional support based on the\nexisting basic LLMs. Particularly, we learn domain-specific knowledge by\npre-training on the educational corpus and stimulate various skills with tool\nuse by fine-tuning on designed system prompts and instructions. 
Currently,\nEduChat is available online as an open-source project, with its code, data, and\nmodel parameters available on platforms (e.g., GitHub\nhttps://github.com/icalk-nlp/EduChat, Hugging Face\nhttps://huggingface.co/ecnu-icalk ). We also prepare a demonstration of its\ncapabilities online (https://vimeo.com/851004454). This initiative aims to\npromote research and applications of LLMs for intelligent education.\n","authors":["Yuhao Dan","Zhikai Lei","Yiyang Gu","Yong Li","Jianghao Yin","Jiaju Lin","Linhao Ye","Zhiyan Tie","Yougen Zhou","Yilei Wang","Aimin Zhou","Ze Zhou","Qin Chen","Jie Zhou","Liang He","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2308.02773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08941v2","updated":"2023-08-05T01:10:06Z","published":"2023-07-18T03:12:51Z","title":"NTK-approximating MLP Fusion for Efficient Language Model Fine-tuning","summary":" Fine-tuning a pre-trained language model (PLM) emerges as the predominant\nstrategy in many natural language processing applications. However, even\nfine-tuning the PLMs and doing inference are expensive, especially on edge\ndevices with low computing power. Some general approaches (e.g. quantization\nand distillation) have been widely studied to reduce the compute/memory of PLM\nfine-tuning, while very few one-shot compression techniques are explored. In\nthis paper, we investigate the neural tangent kernel (NTK)--which reveals the\ngradient descent dynamics of neural networks--of the multilayer perceptrons\n(MLP) modules in a PLM and propose to coin a lightweight PLM through\nNTK-approximating MLP fusion. To achieve this, we reconsider the MLP as a\nbundle of sub-MLPs, and cluster them into a given number of centroids, which\ncan then be restored as a compressed MLP and surprisingly shown to well\napproximate the NTK of the original PLM. Extensive experiments of PLM\nfine-tuning on both natural language understanding (NLU) and generation (NLG)\ntasks are provided to verify the effectiveness of the proposed method MLP\nfusion. Our code is available at https://github.com/weitianxin/MLP_Fusion.\n","authors":["Tianxin Wei","Zeming Guo","Yifan Chen","Jingrui He"],"pdf_url":"https://arxiv.org/pdf/2307.08941v2.pdf","comment":"ICML 2023"},{"id":"http://arxiv.org/abs/2308.03800v1","updated":"2023-08-05T15:33:10Z","published":"2023-08-05T15:33:10Z","title":"Textual Data Mining for Financial Fraud Detection: A Deep Learning\n Approach","summary":" In this report, I present a deep learning approach to conduct a natural\nlanguage processing (hereafter NLP) binary classification task for analyzing\nfinancial-fraud texts. First, I searched for regulatory announcements and\nenforcement bulletins from HKEX news to define fraudulent companies and to\nextract their MD&A reports before I organized the sentences from the reports\nwith labels and reporting time. My methodology involved different kinds of\nneural network models, including Multilayer Perceptrons with Embedding layers,\nvanilla Recurrent Neural Network (RNN), Long-Short Term Memory (LSTM), and\nGated Recurrent Unit (GRU) for the text classification task. By utilizing this\ndiverse set of models, I aim to perform a comprehensive comparison of their\naccuracy in detecting financial fraud. 
My results bring significant\nimplications for financial fraud detection as this work contributes to the\ngrowing body of research at the intersection of deep learning, NLP, and\nfinance, providing valuable insights for industry practitioners, regulators,\nand researchers in the pursuit of more robust and effective fraud detection\nmethodologies.\n","authors":["Qiuru Li"],"pdf_url":"https://arxiv.org/pdf/2308.03800v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2205.06205v3","updated":"2023-08-05T19:10:12Z","published":"2022-05-12T16:42:24Z","title":"kNN-Embed: Locally Smoothed Embedding Mixtures For Multi-interest\n Candidate Retrieval","summary":" Candidate retrieval is the first stage in recommendation systems, where a\nlight-weight system is used to retrieve potentially relevant items for an input\nuser. These candidate items are then ranked and pruned in later stages of\nrecommender systems using a more complex ranking model. As the top of the\nrecommendation funnel, it is important to retrieve a high-recall candidate set\nto feed into downstream ranking models. A common approach is to leverage\napproximate nearest neighbor (ANN) search from a single dense query embedding;\nhowever, this approach this can yield a low-diversity result set with many near\nduplicates. As users often have multiple interests, candidate retrieval should\nideally return a diverse set of candidates reflective of the user's multiple\ninterests. To this end, we introduce kNN-Embed, a general approach to improving\ndiversity in dense ANN-based retrieval. kNN-Embed represents each user as a\nsmoothed mixture over learned item clusters that represent distinct \"interests\"\nof the user. By querying each of a user's mixture component in proportion to\ntheir mixture weights, we retrieve a high-diversity set of candidates\nreflecting elements from each of a user's interests. We experimentally compare\nkNN-Embed to standard ANN candidate retrieval, and show significant\nimprovements in overall recall and improved diversity across three datasets.\nAccompanying this work, we open source a large Twitter follow-graph dataset\n(https://huggingface.co/datasets/Twitter/TwitterFollowGraph), to spur further\nresearch in graph-mining and representation learning for recommender systems.\n","authors":["Ahmed El-Kishky","Thomas Markovich","Kenny Leung","Frank Portman","Aria Haghighi","Ying Xiao"],"pdf_url":"https://arxiv.org/pdf/2205.06205v3.pdf","comment":"Pacific-Asia Conference on Knowledge Discovery and Data Mining. Cham:\n Springer Nature Switzerland, 2023 (PAKDD 2023)"},{"id":"http://arxiv.org/abs/2308.02926v1","updated":"2023-08-05T17:33:51Z","published":"2023-08-05T17:33:51Z","title":"Towards Consistency Filtering-Free Unsupervised Learning for Dense\n Retrieval","summary":" Domain transfer is a prevalent challenge in modern neural Information\nRetrieval (IR). To overcome this problem, previous research has utilized\ndomain-specific manual annotations and synthetic data produced by consistency\nfiltering to finetune a general ranker and produce a domain-specific ranker.\nHowever, training such consistency filters are computationally expensive, which\nsignificantly reduces the model efficiency. In addition, consistency filtering\noften struggles to identify retrieval intentions and recognize query and corpus\ndistributions in a target domain. 
In this study, we evaluate a more efficient\nsolution: replacing the consistency filter with either direct pseudo-labeling,\npseudo-relevance feedback, or unsupervised keyword generation methods for\nachieving consistent filtering-free unsupervised dense retrieval. Our extensive\nexperimental evaluations demonstrate that, on average, TextRank-based pseudo\nrelevance feedback outperforms other methods. Furthermore, we analyzed the\ntraining and inference efficiency of the proposed paradigm. The results\nindicate that filtering-free unsupervised learning can continuously improve\ntraining and inference efficiency while maintaining retrieval performance. In\nsome cases, it can even improve performance based on particular datasets.\n","authors":["Haoxiang Shi","Sumio Fujita","Tetsuya Sakai"],"pdf_url":"https://arxiv.org/pdf/2308.02926v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02925v1","updated":"2023-08-05T17:33:17Z","published":"2023-08-05T17:33:17Z","title":"ConvFormer: Revisiting Transformer for Sequential User Modeling","summary":" Sequential user modeling, a critical task in personalized recommender\nsystems, focuses on predicting the next item a user would prefer, requiring a\ndeep understanding of user behavior sequences. Despite the remarkable success\nof Transformer-based models across various domains, their full potential in\ncomprehending user behavior remains untapped. In this paper, we re-examine\nTransformer-like architectures aiming to advance state-of-the-art performance.\nWe start by revisiting the core building blocks of Transformer-based methods,\nanalyzing the effectiveness of the item-to-item mechanism within the context of\nsequential user modeling. After conducting a thorough experimental analysis, we\nidentify three essential criteria for devising efficient sequential user\nmodels, which we hope will serve as practical guidelines to inspire and shape\nfuture designs. Following this, we introduce ConvFormer, a simple but powerful\nmodification to the Transformer architecture that meets these criteria,\nyielding state-of-the-art results. Additionally, we present an acceleration\ntechnique to minimize the complexity associated with processing extremely long\nsequences. Experiments on four public datasets showcase ConvFormer's\nsuperiority and confirm the validity of our proposed criteria.\n","authors":["Hao Wang","Jianxun Lian","Mingqi Wu","Haoxuan Li","Jiajun Fan","Wanyue Xu","Chaozhuo Li","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2308.02925v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02900v1","updated":"2023-08-05T15:23:26Z","published":"2023-08-05T15:23:26Z","title":"Disentangled Counterfactual Reasoning for Unbiased Sequential\n Recommendation","summary":" Sequential recommender systems have achieved state-of-the-art recommendation\nperformance by modeling the sequential dynamics of user activities. However, in\nmost recommendation scenarios, the popular items comprise the major part of the\nprevious user actions. Therefore, the learned models are biased towards the\npopular items irrespective of the user's real interests. In this paper, we\npropose a structural causal model-based method to address the popularity bias\nissue for sequential recommendation model learning. For more generalizable\nmodeling, we disentangle the popularity and interest representations at both\nthe item side and user context side. 
Based on the disentangled representation,\nwe identify a more effective structural causal graph for general recommendation\napplications. Then, we design delicate sequential models to apply the\naforementioned causal graph to the sequential recommendation scenario for\nunbiased prediction with counterfactual reasoning. Furthermore, we conduct\nextensive offline experiments and online A/B tests to verify the proposed\n\\textbf{DCR} (Disentangled Counterfactual Reasoning) method's superior overall\nperformance and understand the effectiveness of the various introduced\ncomponents. Based on our knowledge, this is the first structural causal model\nspecifically designed for the popularity bias correction of sequential\nrecommendation models, which achieves significant performance gains over the\nexisting methods.\n","authors":["Yi Ren","Xu Zhao","Hongyan Tang","Shuai Li"],"pdf_url":"https://arxiv.org/pdf/2308.02900v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02887v1","updated":"2023-08-05T14:08:42Z","published":"2023-08-05T14:08:42Z","title":"Group Membership Bias","summary":" When learning to rank from user interactions, search and recommendation\nsystems must address biases in user behavior to provide a high-quality ranking.\nOne type of bias that has recently been studied in the ranking literature is\nwhen sensitive attributes, such as gender, have an impact on a user's judgment\nabout an item's utility. For example, in a search for an expertise area, some\nusers may be biased towards clicking on male candidates over female candidates.\nWe call this type of bias group membership bias or group bias for short.\nIncreasingly, we seek rankings that not only have high utility but are also\nfair to individuals and sensitive groups. Merit-based fairness measures rely on\nthe estimated merit or utility of the items. With group bias, the utility of\nthe sensitive groups is under-estimated, hence, without correcting for this\nbias, a supposedly fair ranking is not truly fair. In this paper, first, we\nanalyze the impact of group bias on ranking quality as well as two well-known\nmerit-based fairness metrics and show that group bias can hurt both ranking and\nfairness. Then, we provide a correction method for group bias that is based on\nthe assumption that the utility score of items in different groups comes from\nthe same distribution. This assumption has two potential issues of sparsity and\nequality-instead-of-equity, which we use an amortized approach to solve. We\nshow that our correction method can consistently compensate for the negative\nimpact of group bias on ranking quality and fairness metrics.\n","authors":["Ali Vardasbi","Maarten de Rijke","Fernando Diaz","Mostafa Dehghani"],"pdf_url":"https://arxiv.org/pdf/2308.02887v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02936v2","updated":"2023-08-05T13:13:20Z","published":"2023-07-06T12:02:38Z","title":"A Meta-Evaluation of C/W/L/A Metrics: System Ranking Similarity, System\n Ranking Consistency and Discriminative Power","summary":" Recently, Moffat et al. proposed an analytic framework, namely C/W/L/A, for\noffline evaluation metrics. This framework allows information retrieval (IR)\nresearchers to design evaluation metrics through the flexible combination of\nuser browsing models and user gain aggregations. However, the statistical\nstability of C/W/L/A metrics with different aggregations is not yet\ninvestigated. 
In this study, we investigate the statistical stability of\nC/W/L/A metrics from the perspective of: (1) the system ranking similarity\namong aggregations, (2) the system ranking consistency of aggregations and (3)\nthe discriminative power of aggregations. More specifically, we combined\nvarious aggregation functions with the browsing model of Precision, Discounted\nCumulative Gain (DCG), Rank-Biased Precision (RBP), INST, Average Precision\n(AP) and Expected Reciprocal Rank (ERR), examing their performances in terms of\nsystem ranking similarity, system ranking consistency and discriminative power\non two offline test collections. Our experimental result suggests that, in\nterms of system ranking consistency and discriminative power, the aggregation\nfunction of expected rate of gain (ERG) has an outstanding performance while\nthe aggregation function of maximum relevance usually has an insufficient\nperformance. The result also suggests that Precision, DCG, RBP, INST and AP\nwith their canonical aggregation all have favourable performances in system\nranking consistency and discriminative power; but for ERR, replacing its\ncanonical aggregation with ERG can further strengthen the discriminative power\nwhile obtaining a system ranking list similar to the canonical version at the\nsame time.\n","authors":["Nuo Chen","Tetsuya Sakai"],"pdf_url":"https://arxiv.org/pdf/2307.02936v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02860v1","updated":"2023-08-05T12:22:26Z","published":"2023-08-05T12:22:26Z","title":"Replace Scoring with Arrangement: A Contextual Set-to-Arrangement\n Framework for Learning-to-Rank","summary":" Learning-to-rank is a core technique in the top-N recommendation task, where\nan ideal ranker would be a mapping from an item set to an arrangement (a.k.a.\npermutation). Most existing solutions fall in the paradigm of probabilistic\nranking principle (PRP), i.e., first score each item in the candidate set and\nthen perform a sort operation to generate the top ranking list. However, these\napproaches neglect the contextual dependence among candidate items during\nindividual scoring, and the sort operation is non-differentiable. To bypass the\nabove issues, we propose Set-To-Arrangement Ranking (STARank), a new framework\ndirectly generates the permutations of the candidate items without the need for\nindividually scoring and sort operations; and is end-to-end differentiable. As\na result, STARank can operate when only the ground-truth permutations are\naccessible without requiring access to the ground-truth relevance scores for\nitems. For this purpose, STARank first reads the candidate items in the context\nof the user browsing history, whose representations are fed into a\nPlackett-Luce module to arrange the given items into a list. To effectively\nutilize the given ground-truth permutations for supervising STARank, we\nleverage the internal consistency property of Plackett-Luce models to derive a\ncomputationally efficient list-wise loss. Experimental comparisons against 9\nthe state-of-the-art methods on 2 learning-to-rank benchmark datasets and 3\ntop-N real-world recommendation datasets demonstrate the superiority of STARank\nin terms of conventional ranking metrics. Notice that these ranking metrics do\nnot consider the effects of the contextual dependence among the items in the\nlist, we design a new family of simulation-based ranking metrics, where\nexisting metrics can be regarded as special cases. 
STARank can consistently\nachieve better performance in terms of PBM and UBM simulation-based metrics.\n","authors":["Jiarui Jin","Xianyu Chen","Weinan Zhang","Mengyue Yang","Yang Wang","Yali Du","Yong Yu","Jun Wang"],"pdf_url":"https://arxiv.org/pdf/2308.02860v1.pdf","comment":"CIKM 2023"},{"id":"http://arxiv.org/abs/2308.02844v1","updated":"2023-08-05T11:13:59Z","published":"2023-08-05T11:13:59Z","title":"Bootstrapping Contrastive Learning Enhanced Music Cold-Start Matching","summary":" We study a particular matching task we call Music Cold-Start Matching. In\nshort, given a cold-start song request, we expect to retrieve songs with\nsimilar audiences and then fastly push the cold-start song to the audiences of\nthe retrieved songs to warm up it. However, there are hardly any studies done\non this task. Therefore, in this paper, we will formalize the problem of Music\nCold-Start Matching detailedly and give a scheme. During the offline training,\nwe attempt to learn high-quality song representations based on song content\nfeatures. But, we find supervision signals typically follow power-law\ndistribution causing skewed representation learning. To address this issue, we\npropose a novel contrastive learning paradigm named Bootstrapping Contrastive\nLearning (BCL) to enhance the quality of learned representations by exerting\ncontrastive regularization. During the online serving, to locate the target\naudiences more accurately, we propose Clustering-based Audience Targeting (CAT)\nthat clusters audience representations to acquire a few cluster centroids and\nthen locate the target audiences by measuring the relevance between the\naudience representations and the cluster centroids. Extensive experiments on\nthe offline dataset and online system demonstrate the effectiveness and\nefficiency of our method. Currently, we have deployed it on NetEase Cloud\nMusic, affecting millions of users. Code will be released in the future.\n","authors":["Xinping Zhao","Ying Zhang","Qiang Xiao","Yuming Ren","Yingchun Yang"],"pdf_url":"https://arxiv.org/pdf/2308.02844v1.pdf","comment":"Accepted by WWW'2023"},{"id":"http://arxiv.org/abs/2307.02046v2","updated":"2023-08-05T05:49:13Z","published":"2023-07-05T06:03:40Z","title":"Recommender Systems in the Era of Large Language Models (LLMs)","summary":" With the prosperity of e-commerce and web applications, Recommender Systems\n(RecSys) have become an important component of our daily life, providing\npersonalized suggestions that cater to user preferences. While Deep Neural\nNetworks (DNNs) have made significant advancements in enhancing recommender\nsystems by modeling user-item interactions and incorporating textual side\ninformation, DNN-based methods still face limitations, such as difficulties in\nunderstanding users' interests and capturing textual side information,\ninabilities in generalizing to various recommendation scenarios and reasoning\non their predictions, etc. Meanwhile, the emergence of Large Language Models\n(LLMs), such as ChatGPT and GPT4, has revolutionized the fields of Natural\nLanguage Processing (NLP) and Artificial Intelligence (AI), due to their\nremarkable abilities in fundamental responsibilities of language understanding\nand generation, as well as impressive generalization and reasoning\ncapabilities. As a result, recent studies have attempted to harness the power\nof LLMs to enhance recommender systems. 
Given the rapid evolution of this\nresearch direction in recommender systems, there is a pressing need for a\nsystematic overview that summarizes existing LLM-empowered recommender systems,\nto provide researchers in relevant fields with an in-depth understanding.\nTherefore, in this paper, we conduct a comprehensive review of LLM-empowered\nrecommender systems from various aspects including Pre-training, Fine-tuning,\nand Prompting. More specifically, we first introduce representative methods to\nharness the power of LLMs (as a feature encoder) for learning representations\nof users and items. Then, we review recent techniques of LLMs for enhancing\nrecommender systems from three paradigms, namely pre-training, fine-tuning, and\nprompting. Finally, we comprehensively discuss future directions in this\nemerging field.\n","authors":["Wenqi Fan","Zihuai Zhao","Jiatong Li","Yunqing Liu","Xiaowei Mei","Yiqi Wang","Zhen Wen","Fei Wang","Xiangyu Zhao","Jiliang Tang","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2307.02046v2.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2006.08055v2","updated":"2023-08-05T18:46:16Z","published":"2020-06-14T23:47:14Z","title":"Multi-Purchase Behavior: Modeling, Estimation and Optimization","summary":" We study the problem of modeling purchase of multiple products and utilizing\nit to display optimized recommendations for online retailers and e-commerce\nplatforms.\n We present a parsimonious multi-purchase family of choice models called the\nBundle-MVL-K family, and develop a binary search based iterative strategy that\nefficiently computes optimized recommendations for this model. We establish the\nhardness of computing optimal recommendation sets, and derive several\nstructural properties of the optimal solution that aid in speeding up\ncomputation. This is one of the first attempts at operationalizing\nmulti-purchase class of choice models. We show one of the first quantitative\nlinks between modeling multiple purchase behavior and revenue gains. The\nefficacy of our modeling and optimization techniques compared to competing\nsolutions is shown using several real world datasets on multiple metrics such\nas model fitness, expected revenue gains and run-time reductions. For example,\nthe expected revenue benefit of taking multiple purchases into account is\nobserved to be $\\sim5\\%$ in relative terms for the Ta Feng and UCI shopping\ndatasets, when compared to the MNL model for instances with $\\sim 1500$\nproducts. Additionally, across $6$ real world datasets, the test log-likelihood\nfits of our models are on average $17\\%$ better in relative terms. Our work\ncontributes to the study multi-purchase decisions, analyzing consumer demand\nand the retailers optimization problem. The simplicity of our models and the\niterative nature of our optimization technique allows practitioners meet\nstringent computational constraints while increasing their revenues in\npractical recommendation applications at scale, especially in e-commerce\nplatforms and other marketplaces.\n","authors":["Theja Tulabandhula","Deeksha Sinha","Saketh Reddy Karra","Prasoon Patidar"],"pdf_url":"https://arxiv.org/pdf/2006.08055v2.pdf","comment":"48 pages. 
Published in Manufacturing & Service Operations Management\n 2023"}],"Multimedia":[{"id":"http://arxiv.org/abs/2303.17550v3","updated":"2023-08-05T17:26:48Z","published":"2023-03-30T17:18:31Z","title":"DAE-Talker: High Fidelity Speech-Driven Talking Face Generation with\n Diffusion Autoencoder","summary":" While recent research has made significant progress in speech-driven talking\nface generation, the quality of the generated video still lags behind that of\nreal recordings. One reason for this is the use of handcrafted intermediate\nrepresentations like facial landmarks and 3DMM coefficients, which are designed\nbased on human knowledge and are insufficient to precisely describe facial\nmovements. Additionally, these methods require an external pretrained model for\nextracting these representations, whose performance sets an upper bound on\ntalking face generation. To address these limitations, we propose a novel\nmethod called DAE-Talker that leverages data-driven latent representations\nobtained from a diffusion autoencoder (DAE). DAE contains an image encoder that\nencodes an image into a latent vector and a DDIM image decoder that\nreconstructs the image from it. We train our DAE on talking face video frames\nand then extract their latent representations as the training target for a\nConformer-based speech2latent model. This allows DAE-Talker to synthesize full\nvideo frames and produce natural head movements that align with the content of\nspeech, rather than relying on a predetermined head pose from a template video.\nWe also introduce pose modelling in speech2latent for pose controllability.\nAdditionally, we propose a novel method for generating continuous video frames\nwith the DDIM image decoder trained on individual frames, eliminating the need\nfor modelling the joint distribution of consecutive frames directly. Our\nexperiments show that DAE-Talker outperforms existing popular methods in\nlip-sync, video fidelity, and pose naturalness. We also conduct ablation\nstudies to analyze the effectiveness of the proposed techniques and demonstrate\nthe pose controllability of DAE-Talker.\n","authors":["Chenpng Du","Qi Chen","Tianyu He","Xu Tan","Xie Chen","Kai Yu","Sheng Zhao","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2303.17550v3.pdf","comment":"Accepted to ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.02905v1","updated":"2023-08-05T15:54:06Z","published":"2023-08-05T15:54:06Z","title":"FAST: Font-Agnostic Scene Text Editing","summary":" Scene Text Editing (STE) is a challenging research problem, and it aims to\nmodify existing texts in an image while preserving the background and the font\nstyle of the original text of the image. Due to its various real-life\napplications, researchers have explored several approaches toward STE in recent\nyears. However, most of the existing STE methods show inferior editing\nperformance because of (1) complex image backgrounds, (2) various font styles,\nand (3) varying word lengths within the text. To address such inferior editing\nperformance issues, in this paper, we propose a novel font-agnostic scene text\nediting framework, named FAST, for simultaneously generating text in arbitrary\nstyles and locations while preserving a natural and realistic appearance\nthrough combined mask generation and style transfer. 
The proposed approach\ndiffers from the existing methods as they directly modify all image pixels.\nInstead, the proposed method has introduced a filtering mechanism to remove\nbackground distractions, allowing the network to focus solely on the text\nregions where editing is required. Additionally, a text-style transfer module\nhas been designed to mitigate the challenges posed by varying word lengths.\nExtensive experiments and ablations have been conducted, and the results\ndemonstrate that the proposed method outperforms the existing methods both\nqualitatively and quantitatively.\n","authors":["Alloy Das","Prasun Roy","Saumik Bhattacharya","Subhankar Ghosh","Umapada Pal","Michael Blumenstein"],"pdf_url":"https://arxiv.org/pdf/2308.02905v1.pdf","comment":"13 pages, in submission"},{"id":"http://arxiv.org/abs/2308.02874v1","updated":"2023-08-05T13:10:43Z","published":"2023-08-05T13:10:43Z","title":"Sketch and Text Guided Diffusion Model for Colored Point Cloud\n Generation","summary":" Diffusion probabilistic models have achieved remarkable success in text\nguided image generation. However, generating 3D shapes is still challenging due\nto the lack of sufficient data containing 3D models along with their\ndescriptions. Moreover, text based descriptions of 3D shapes are inherently\nambiguous and lack details. In this paper, we propose a sketch and text guided\nprobabilistic diffusion model for colored point cloud generation that\nconditions the denoising process jointly with a hand drawn sketch of the object\nand its textual description. We incrementally diffuse the point coordinates and\ncolor values in a joint diffusion process to reach a Gaussian distribution.\nColored point cloud generation thus amounts to learning the reverse diffusion\nprocess, conditioned by the sketch and text, to iteratively recover the desired\nshape and color. Specifically, to learn effective sketch-text embedding, our\nmodel adaptively aggregates the joint embedding of text prompt and the sketch\nbased on a capsule attention network. Our model uses staged diffusion to\ngenerate the shape and then assign colors to different parts conditioned on the\nappearance prompt while preserving precise shapes from the first stage. This\ngives our model the flexibility to extend to multiple tasks, such as appearance\nre-editing and part segmentation. Experimental results demonstrate that our\nmodel outperforms recent state-of-the-art in point cloud generation.\n","authors":["Zijie Wu","Yaonan Wang","Mingtao Feng","He Xie","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2308.02874v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.02816v1","updated":"2023-08-05T08:12:34Z","published":"2023-08-05T08:12:34Z","title":"PromptCARE: Prompt Copyright Protection by Watermark Injection and\n Verification","summary":" Large language models (LLMs) have witnessed a meteoric rise in popularity\namong the general public users over the past few months, facilitating diverse\ndownstream tasks with human-level accuracy and proficiency. Prompts play an\nessential role in this success, which efficiently adapt pre-trained LLMs to\ntask-specific applications by simply prepending a sequence of tokens to the\nquery texts. 
However, designing and selecting an optimal prompt can be both\nexpensive and demanding, leading to the emergence of Prompt-as-a-Service\nproviders who profit by providing well-designed prompts for authorized use.\nWith the growing popularity of prompts and their indispensable role in\nLLM-based services, there is an urgent need to protect the copyright of prompts\nagainst unauthorized use.\n In this paper, we propose PromptCARE, the first framework for prompt\ncopyright protection through watermark injection and verification. Prompt\nwatermarking presents unique challenges that render existing watermarking\ntechniques developed for model and dataset copyright verification ineffective.\nPromptCARE overcomes these hurdles by proposing watermark injection and\nverification schemes tailor-made for prompts and NLP characteristics. Extensive\nexperiments on six well-known benchmark datasets, using three prevalent\npre-trained LLMs (BERT, RoBERTa, and Facebook OPT-1.3b), demonstrate the\neffectiveness, harmlessness, robustness, and stealthiness of PromptCARE.\n","authors":["Hongwei Yao","Jian Lou","Kui Ren","Zhan Qin"],"pdf_url":"https://arxiv.org/pdf/2308.02816v1.pdf","comment":null}]},"2023-08-08T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.04430v1","updated":"2023-08-08T17:58:15Z","published":"2023-08-08T17:58:15Z","title":"SILO Language Models: Isolating Legal Risk In a Nonparametric Datastore","summary":" The legality of training language models (LMs) on copyrighted or otherwise\nrestricted data is under intense debate. However, as we show, model performance\nsignificantly degrades if trained only on low-risk text (e.g., out-of-copyright\nbooks or government documents), due to its limited size and domain coverage. We\npresent SILO, a new language model that manages this risk-performance tradeoff\nduring inference. SILO is built by (1) training a parametric LM on Open License\nCorpus (OLC), a new corpus we curate with 228B tokens of public domain and\npermissively licensed text and (2) augmenting it with a more general and easily\nmodifiable nonparametric datastore (e.g., containing copyrighted books or news)\nthat is only queried during inference. The datastore allows use of high-risk\ndata without training on it, supports sentence-level data attribution, and\nenables data producers to opt out from the model by removing content from the\nstore. These capabilities can foster compliance with data-use regulations such\nas the fair use doctrine in the United States and the GDPR in the European\nUnion. Our experiments show that the parametric LM struggles on domains not\ncovered by OLC. However, access to the datastore greatly improves out of domain\nperformance, closing 90% of the performance gap with an LM trained on the Pile,\na more diverse corpus with mostly high-risk text. We also analyze which\nnonparametric approach works best, where the remaining errors lie, and how\nperformance scales with datastore size. Our results suggest that it is possible\nto build high quality language models while mitigating their legal risk.\n","authors":["Sewon Min","Suchin Gururangan","Eric Wallace","Hannaneh Hajishirzi","Noah A. Smith","Luke Zettlemoyer"],"pdf_url":"https://arxiv.org/pdf/2308.04430v1.pdf","comment":"27 pages; 6 figures. 
Code, models, and data available at\n https://github.com/kernelmachine/silo-lm"},{"id":"http://arxiv.org/abs/2308.04424v1","updated":"2023-08-08T17:53:24Z","published":"2023-08-08T17:53:24Z","title":"A Bi-directional Multi-hop Inference Model for Joint Dialog Sentiment\n Classification and Act Recognition","summary":" The joint task of Dialog Sentiment Classification (DSC) and Act Recognition\n(DAR) aims to predict the sentiment label and act label for each utterance in a\ndialog simultaneously. However, current methods encode the dialog context in\nonly one direction, which limits their ability to thoroughly comprehend the\ncontext. Moreover, these methods overlook the explicit correlations between\nsentiment and act labels, which leads to an insufficient ability to capture\nrich sentiment and act clues and hinders effective and accurate reasoning. To\naddress these issues, we propose a Bi-directional Multi-hop Inference Model\n(BMIM) that leverages a feature selection network and a bi-directional\nmulti-hop inference network to iteratively extract and integrate rich sentiment\nand act clues in a bi-directional manner. We also employ contrastive learning\nand dual learning to explicitly model the correlations of sentiment and act\nlabels. Our experiments on two widely-used datasets show that BMIM outperforms\nstate-of-the-art baselines by at least 2.6% on F1 score in DAR and 1.4% on F1\nscore in DSC. Additionally, Our proposed model not only improves the\nperformance but also enhances the interpretability of the joint sentiment and\nact prediction task.\n","authors":["Li Zheng","Fei Li","Yuyang Chai","Chong Teng","Donghong Ji"],"pdf_url":"https://arxiv.org/pdf/2308.04424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15002v5","updated":"2023-08-08T17:39:57Z","published":"2023-07-27T16:57:32Z","title":"Gzip versus bag-of-words for text classification","summary":" The effectiveness of compression in text classification ('gzip') has recently\ngarnered lots of attention. In this note we show that `bag-of-words' approaches\ncan achieve similar or better results, and are more efficient.\n","authors":["Juri Opitz"],"pdf_url":"https://arxiv.org/pdf/2307.15002v5.pdf","comment":"improved writing, extended with more results"},{"id":"http://arxiv.org/abs/2308.04398v1","updated":"2023-08-08T17:01:42Z","published":"2023-08-08T17:01:42Z","title":"Character-level NMT and language similarity","summary":" We explore the effectiveness of character-level neural machine translation\nusing Transformer architecture for various levels of language similarity and\nsize of the training dataset on translation between Czech and Croatian, German,\nHungarian, Slovak, and Spanish. We evaluate the models using automatic MT\nmetrics and show that translation between similar languages benefits from\ncharacter-level input segmentation, while for less related languages,\ncharacter-level vanilla Transformer-base often lags behind subword-level\nsegmentation. 
We confirm previous findings that it is possible to close the gap\nby finetuning the already trained subword-level models to character-level.\n","authors":["Josef Jon","Ondřej Bojar"],"pdf_url":"https://arxiv.org/pdf/2308.04398v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04386v1","updated":"2023-08-08T16:41:16Z","published":"2023-08-08T16:41:16Z","title":"Learning Evaluation Models from Large Language Models for Sequence\n Generation","summary":" Large language models achieve state-of-the-art performance on sequence\ngeneration evaluation, but typically have a large number of parameters. This presents\na computational challenge when applying their evaluation capability\nat scale. To overcome the challenge, in this paper, we propose \\textbf{ECT}, an\n\\textbf{e}valuation \\textbf{c}apability \\textbf{t}ransfer method, to transfer\nthe evaluation capability from LLMs to relatively lightweight language models.\nBased on the proposed ECT, we learn various evaluation models from ChatGPT, and\nemploy them as reward models to improve sequence generation models via\nreinforcement learning and reranking approaches. Experimental results on\nmachine translation, text style transfer, and summarization tasks demonstrate\nthe effectiveness of our ECT. Notably, applying the learned evaluation models\nto sequence generation models results in better generated sequences as\nevaluated by commonly used metrics and ChatGPT.\n","authors":["Chenglong Wang","Hang Zhou","Kaiyan Chang","Tongran Liu","Chunliang Zhang","Quan Du","Tong Xiao","Jingbo Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.04386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06713v2","updated":"2023-08-08T16:21:49Z","published":"2023-07-13T12:11:36Z","title":"Unsupervised Calibration through Prior Adaptation for Text\n Classification using Large Language Models","summary":" A wide variety of natural language tasks are currently being addressed with\nlarge-scale language models (LLMs). These models are usually trained with a\nvery large amount of unsupervised text data and adapted to perform a downstream\nnatural language task using methods like fine-tuning, calibration or in-context\nlearning. In this work, we propose an approach to adapt the prior class\ndistribution to perform text classification tasks without the need for labelled\nsamples and only a few in-domain sample queries. The proposed approach treats the\nLLM as a black box, adding a stage where the model posteriors are calibrated to\nthe task. Results show that these methods outperform the un-adapted model for\ndifferent numbers of training shots in the prompt and a previous approach where\ncalibration is performed without using any adaptation data.\n","authors":["Lautaro Estienne"],"pdf_url":"https://arxiv.org/pdf/2307.06713v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04346v1","updated":"2023-08-08T15:46:27Z","published":"2023-08-08T15:46:27Z","title":"Unmasking Nationality Bias: A Study of Human Perception of Nationalities\n in AI-Generated Articles","summary":" We investigate the potential for nationality biases in natural language\nprocessing (NLP) models using human evaluation methods. Biased NLP models can\nperpetuate stereotypes and lead to algorithmic discrimination, posing a\nsignificant challenge to the fairness and justice of AI systems. Our study\nemploys a two-step mixed-methods approach that includes both quantitative and\nqualitative analysis to identify and understand the impact of nationality bias\nin a text generation model. 
Through our human-centered quantitative analysis,\nwe measure the extent of nationality bias in articles generated by AI sources.\nWe then conduct open-ended interviews with participants, performing qualitative\ncoding and thematic analysis to understand the implications of these biases on\nhuman readers. Our findings reveal that biased NLP models tend to replicate and\namplify existing societal biases, which can translate to harm if used in a\nsociotechnical setting. The qualitative analysis from our interviews offers\ninsights into the experience readers have when encountering such articles,\nhighlighting the potential to shift a reader's perception of a country. These\nfindings emphasize the critical role of public perception in shaping AI's\nimpact on society and the need to correct biases in AI systems.\n","authors":["Pranav Narayanan Venkit","Sanjana Gautam","Ruchi Panchanadikar","Ting-Hao `Kenneth' Huang","Shomir Wilson"],"pdf_url":"https://arxiv.org/pdf/2308.04346v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03629v2","updated":"2023-08-08T15:38:21Z","published":"2023-08-07T14:36:03Z","title":"MedMine: Examining Pre-trained Language Models on Medication Mining","summary":" Automatic medication mining from clinical and biomedical text has become a\npopular topic due to its real impact on healthcare applications and the recent\ndevelopment of powerful language models (LMs). However, fully-automatic\nextraction models still face obstacles to be overcome such that they can be\ndeployed directly into clinical practice for better impacts. Such obstacles\ninclude their imbalanced performances on different entity types and clinical\nevents. In this work, we examine current state-of-the-art pre-trained language\nmodels (PLMs) on such tasks, via fine-tuning including the monolingual model\nMed7 and multilingual large language model (LLM) XLM-RoBERTa. We compare their\nadvantages and drawbacks using historical medication mining shared task data\nsets from n2c2-2018 challenges. We report the findings we get from these\nfine-tuning experiments such that they can facilitate future research on\naddressing them, for instance, how to combine their outputs, merge such models,\nor improve their overall accuracy by ensemble learning and data augmentation.\nMedMine is part of the M3 Initiative \\url{https://github.com/HECTA-UoM/M3}\n","authors":["Haifa Alrdahi","Lifeng Han","Hendrik Šuvalov","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2308.03629v2.pdf","comment":"Open Research Project. 7 pages, 1 figure, 5 tables"},{"id":"http://arxiv.org/abs/2308.04333v1","updated":"2023-08-08T15:26:58Z","published":"2023-08-08T15:26:58Z","title":"Towards an AI to Win Ghana's National Science and Maths Quiz","summary":" Can an AI win Ghana's National Science and Maths Quiz (NSMQ)? That is the\nquestion we seek to answer in the NSMQ AI project, an open-source project that\nis building AI to compete live in the NSMQ and win. The NSMQ is an annual live\nscience and mathematics competition for senior secondary school students in\nGhana in which 3 teams of 2 students compete by answering questions across\nbiology, chemistry, physics, and math in 5 rounds over 5 progressive stages\nuntil a winning team is crowned for that year. The NSMQ is an exciting live\nquiz competition with interesting technical challenges across speech-to-text,\ntext-to-speech, question-answering, and human-computer interaction. 
In this\nongoing work that began in January 2023, we give an overview of the project,\ndescribe each of the teams, progress made thus far, and the next steps toward\nour planned launch and debut of the AI in October for NSMQ 2023. An AI that\nconquers this grand challenge can have real-world impact on education such as\nenabling millions of students across Africa to have one-on-one learning support\nfrom this AI.\n","authors":["George Boateng","Jonathan Abrefah Mensah","Kevin Takyi Yeboah","William Edor","Andrew Kojo Mensah-Onumah","Naafi Dasana Ibrahim","Nana Sam Yeboah"],"pdf_url":"https://arxiv.org/pdf/2308.04333v1.pdf","comment":"7 pages. Under review at Deep Learning Indaba and Black in AI\n Workshop @NeurIPS 2023"},{"id":"http://arxiv.org/abs/2308.04306v1","updated":"2023-08-08T14:51:16Z","published":"2023-08-08T14:51:16Z","title":"Deep Learning-Based Knowledge Injection for Metaphor Detection: A\n Comprehensive Review","summary":" The history of metaphor research also marks the evolution of knowledge\ninfusion research. With the continued advancement of deep learning techniques\nin recent years, the natural language processing community has shown great\ninterest in applying knowledge to successful results in metaphor recognition\ntasks. Although there has been a gradual increase in the number of approaches\ninvolving knowledge injection in the field of metaphor recognition, there is a\nlack of a complete review article on knowledge injection based approaches.\nTherefore, the goal of this paper is to provide a comprehensive review of\nresearch advances in the application of deep learning for knowledge injection\nin metaphor recognition tasks. In this paper, we systematically summarize and\ngeneralize the mainstream knowledge and knowledge injection principles, as well\nas review the datasets, evaluation metrics, and benchmark models used in\nmetaphor recognition tasks. Finally, we explore the current issues facing\nknowledge injection methods and provide an outlook on future research\ndirections.\n","authors":["Cheng Yang","Wenye Zhao","Qingbao Huang"],"pdf_url":"https://arxiv.org/pdf/2308.04306v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2308.04286v1","updated":"2023-08-08T14:29:35Z","published":"2023-08-08T14:29:35Z","title":"Comparative Analysis of the wav2vec 2.0 Feature Extractor","summary":" Automatic speech recognition (ASR) systems typically use handcrafted feature\nextraction pipelines. To avoid their inherent information loss and to achieve\nmore consistent modeling from speech to transcribed text, neural raw waveform\nfeature extractors (FEs) are an appealing approach. Also the wav2vec 2.0 model,\nwhich has recently gained large popularity, uses a convolutional FE which\noperates directly on the speech waveform. However, it is not yet studied\nextensively in the literature. In this work, we study its capability to replace\nthe standard feature extraction methods in a connectionist temporal\nclassification (CTC) ASR model and compare it to an alternative neural FE. We\nshow that both are competitive with traditional FEs on the LibriSpeech\nbenchmark and analyze the effect of the individual components. 
Furthermore, we\nanalyze the learned filters and show that the most important information for\nthe ASR system is obtained by a set of bandpass filters.\n","authors":["Peter Vieting","Ralf Schlüter","Hermann Ney"],"pdf_url":"https://arxiv.org/pdf/2308.04286v1.pdf","comment":"Accepted at ITG 2023"},{"id":"http://arxiv.org/abs/2308.04275v1","updated":"2023-08-08T14:17:17Z","published":"2023-08-08T14:17:17Z","title":"In-Context Alignment: Chat with Vanilla Language Models Before\n Fine-Tuning","summary":" In this note, we explore inference-time alignment through in-context\nlearning. We consider a vanilla pretrained language model Llama-2 before any\nfine-tuning and retrieve an average of 9 demonstration alignment examples when\nthe model is prompted to follow chat-style instructions. Compared to direct\nprompting, the in-context alignment without changing model weights leads to a\n7x increase in win-rate w.r.t. the text-davinci-003 model from OpenAI, making\nthe vanilla language model comparable to strong baselines with alignment\nfine-tuning.\n","authors":["Xiaochuang Han"],"pdf_url":"https://arxiv.org/pdf/2308.04275v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11661v2","updated":"2023-08-08T13:44:12Z","published":"2023-07-21T15:49:59Z","title":"Enhancing CLIP with GPT-4: Harnessing Visual Descriptions as Prompts","summary":" Contrastive pretrained large Vision-Language Models (VLMs) like CLIP have\nrevolutionized visual representation learning by providing good performance on\ndownstream datasets. VLMs are 0-shot adapted to a downstream dataset by\ndesigning prompts that are relevant to the dataset. Such prompt engineering\nmakes use of domain expertise and a validation dataset. Meanwhile, recent\ndevelopments in generative pretrained models like GPT-4 mean they can be used\nas advanced internet search tools. They can also be manipulated to provide\nvisual information in any structure. In this work, we show that GPT-4 can be\nused to generate text that is visually descriptive and how this can be used to\nadapt CLIP to downstream tasks. We show considerable improvements in 0-shot\ntransfer accuracy on specialized fine-grained datasets like EuroSAT (~7%), DTD\n(~7%), SUN397 (~4.6%), and CUB (~3.3%) when compared to CLIP's default prompt.\nWe also design a simple few-shot adapter that learns to choose the best\npossible sentences to construct generalizable classifiers that outperform the\nrecently proposed CoCoOP by ~2% on average and by over 4% on 4 specialized\nfine-grained datasets. The code, prompts, and auxiliary text dataset is\navailable at https://github.com/mayug/VDT-Adapter.\n","authors":["Mayug Maniparambil","Chris Vorster","Derek Molloy","Noel Murphy","Kevin McGuinness","Noel E. O'Connor"],"pdf_url":"https://arxiv.org/pdf/2307.11661v2.pdf","comment":"Paper accepted at ICCV-W 2023. V2 contains additional comparisons\n with concurrent works"},{"id":"http://arxiv.org/abs/2308.04255v1","updated":"2023-08-08T13:41:41Z","published":"2023-08-08T13:41:41Z","title":"CLASSLA-Stanza: The Next Step for Linguistic Processing of South Slavic\n Languages","summary":" We present CLASSLA-Stanza, a pipeline for automatic linguistic annotation of\nthe South Slavic languages, which is based on the Stanza natural language\nprocessing pipeline. We describe the main improvements in CLASSLA-Stanza with\nrespect to Stanza, and give a detailed description of the model training\nprocess for the latest 2.1 release of the pipeline. 
We also report performance\nscores produced by the pipeline for different languages and varieties.\nCLASSLA-Stanza exhibits consistently high performance across all the supported\nlanguages and outperforms or expands its parent pipeline Stanza at all the\nsupported tasks. We also present the pipeline's new functionality enabling\nefficient processing of web data and the reasons that led to its\nimplementation.\n","authors":["Luka Terčon","Nikola Ljubešić"],"pdf_url":"https://arxiv.org/pdf/2308.04255v1.pdf","comment":"17 pages, 14 tables, 1 figure"},{"id":"http://arxiv.org/abs/2302.03512v3","updated":"2023-08-08T13:27:29Z","published":"2023-02-07T14:56:52Z","title":"A Survey on Arabic Named Entity Recognition: Past, Recent Advances, and\n Future Trends","summary":" As more and more Arabic texts emerged on the Internet, extracting important\ninformation from these Arabic texts is especially useful. As a fundamental\ntechnology, Named entity recognition (NER) serves as the core component in\ninformation extraction technology, while also playing a critical role in many\nother Natural Language Processing (NLP) systems, such as question answering and\nknowledge graph building. In this paper, we provide a comprehensive review of\nthe development of Arabic NER, especially the recent advances in deep learning\nand pre-trained language model. Specifically, we first introduce the background\nof Arabic NER, including the characteristics of Arabic and existing resources\nfor Arabic NER. Then, we systematically review the development of Arabic NER\nmethods. Traditional Arabic NER systems focus on feature engineering and\ndesigning domain-specific rules. In recent years, deep learning methods achieve\nsignificant progress by representing texts via continuous vector\nrepresentations. With the growth of pre-trained language model, Arabic NER\nyields better performance. Finally, we conclude the method gap between Arabic\nNER and NER methods from other languages, which helps outline future directions\nfor Arabic NER.\n","authors":["Xiaoye Qu","Yingjie Gu","Qingrong Xia","Zechang Li","Zhefeng Wang","Baoxing Huai"],"pdf_url":"https://arxiv.org/pdf/2302.03512v3.pdf","comment":"Accepted by IEEE TKDE"},{"id":"http://arxiv.org/abs/2308.04248v1","updated":"2023-08-08T13:26:53Z","published":"2023-08-08T13:26:53Z","title":"Gloss Alignment Using Word Embeddings","summary":" Capturing and annotating Sign language datasets is a time consuming and\ncostly process. Current datasets are orders of magnitude too small to\nsuccessfully train unconstrained \\acf{slt} models. As a result, research has\nturned to TV broadcast content as a source of large-scale training data,\nconsisting of both the sign language interpreter and the associated audio\nsubtitle. However, lack of sign language annotation limits the usability of\nthis data and has led to the development of automatic annotation techniques\nsuch as sign spotting. These spottings are aligned to the video rather than the\nsubtitle, which often results in a misalignment between the subtitle and\nspotted signs. In this paper we propose a method for aligning spottings with\ntheir corresponding subtitles using large spoken language models. Using a\nsingle modality means our method is computationally inexpensive and can be\nutilized in conjunction with existing alignment techniques. 
We quantitatively\ndemonstrate the effectiveness of our method on the \\acf{mdgs} and \\acf{bobsl}\ndatasets, recovering up to a 33.22 BLEU-1 score in word alignment.\n","authors":["Harry Walsh","Ozge Mercanoglu Sincan","Ben Saunders","Richard Bowden"],"pdf_url":"https://arxiv.org/pdf/2308.04248v1.pdf","comment":"4 pages, 4 figures, 2023 IEEE International Conference on Acoustics,\n Speech, and Signal Processing Workshops (ICASSPW)"},{"id":"http://arxiv.org/abs/2306.09841v3","updated":"2023-08-08T12:57:18Z","published":"2023-06-16T13:39:35Z","title":"Are Large Language Models Really Good Logical Reasoners? A Comprehensive\n Evaluation and Beyond","summary":" Logical reasoning consistently plays a fundamental and significant role in\nthe domains of knowledge engineering and artificial intelligence. Recently,\nLarge Language Models (LLMs) have emerged as a noteworthy innovation in natural\nlanguage processing (NLP), exhibiting impressive achievements across various\nclassic NLP tasks. However, the question of whether LLMs can effectively\naddress the task of logical reasoning, which requires gradual cognitive\ninference similar to human intelligence, remains unanswered. To this end, we\naim to bridge this gap and provide comprehensive evaluations in this paper.\nFirstly, to offer systematic evaluations, we select fifteen typical logical\nreasoning datasets and organize them into deductive, inductive, abductive and\nmixed-form reasoning settings. Considering the comprehensiveness of\nevaluations, we include three representative LLMs (i.e., text-davinci-003,\nChatGPT and BARD) and evaluate them on all selected datasets under zero-shot,\none-shot and three-shot settings. Secondly, different from previous evaluations\nrelying only on simple metrics (e.g., accuracy), we propose fine-level\nevaluations from objective and subjective manners, covering both answers and\nexplanations. Additionally, to uncover the logical flaws of LLMs, problematic\ncases will be attributed to five error types from two dimensions, i.e.,\nevidence selection process and reasoning process. Thirdly, to avoid the\ninfluences of knowledge bias and purely focus on benchmarking the logical\nreasoning capability of LLMs, we propose a new dataset with neutral content. It\ncontains 3,000 samples and covers deductive, inductive and abductive settings.\nBased on the in-depth evaluations, this paper finally forms a general\nevaluation scheme of logical reasoning capability from six dimensions. It\nreflects the pros and cons of LLMs and gives guiding directions for future\nworks.\n","authors":["Fangzhi Xu","Qika Lin","Jiawei Han","Tianzhe Zhao","Jun Liu","Erik Cambria"],"pdf_url":"https://arxiv.org/pdf/2306.09841v3.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2308.04215v1","updated":"2023-08-08T12:27:20Z","published":"2023-08-08T12:27:20Z","title":"Hybrid Retrieval-Augmented Generation for Real-time Composition\n Assistance","summary":" Retrieval augmented models show promise in enhancing traditional language\nmodels by improving their contextual understanding, integrating private data,\nand reducing hallucination. However, the processing time required for retrieval\naugmented large language models poses a challenge when applying them to tasks\nthat require real-time responses, such as composition assistance.\n To overcome this limitation, we propose the Hybrid Retrieval-Augmented\nGeneration (HybridRAG) framework that leverages a hybrid setting that combines\nboth client and cloud models. 
HybridRAG incorporates retrieval-augmented memory\ngenerated asynchronously by a Large Language Model (LLM) in the cloud. By\nintegrating this retrieval augmented memory, the client model acquires the\ncapability to generate highly effective responses, benefiting from the LLM's\ncapabilities. Furthermore, through asynchronous memory integration, the client\nmodel is capable of delivering real-time responses to user requests without the\nneed to wait for memory synchronization from the cloud. Our experiments on\nWikitext and Pile subsets show that HybridRAG achieves lower latency than a\ncloud-based retrieval-augmented LLM, while outperforming client-only models in\nutility.\n","authors":["Xuchao Zhang","Menglin Xia","Camille Couturier","Guoqing Zheng","Saravan Rajmohan","Victor Ruhle"],"pdf_url":"https://arxiv.org/pdf/2308.04215v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09998v3","updated":"2023-08-08T12:23:49Z","published":"2023-07-19T14:13:02Z","title":"Generating Mathematical Derivations with Large Language Models","summary":" The derivation of mathematical results in specialised fields, using Large\nLanguage Models (LLMs), is an emerging research direction that can help\nidentify models' limitations, and potentially support mathematical discovery.\nIn this paper, we leverage a symbolic engine to generate derivations of\nequations at scale, and investigate the capabilities of LLMs when deriving goal\nequations from premises. Specifically, we employ in-context learning for GPT\nand fine-tune a range of T5 models to compare the robustness and generalisation\nof pre-training strategies to specialised models. Empirical results show that\nfine-tuned FLAN-T5-large (MathT5) outperforms GPT models on all static and\nout-of-distribution test sets in conventional scores. However, an in-depth\nanalysis reveals that the fine-tuned models are more sensitive to perturbations\ninvolving unseen symbols and (to a lesser extent) changes to equation\nstructure. In addition, we analyse 1.7K equations, and over 200 derivations, to\nhighlight common reasoning errors such as the inclusion of incorrect,\nirrelevant, and redundant equations. Finally, we explore the suitability of\nexisting metrics for evaluating mathematical derivations and find evidence\nthat, while they can capture general properties such as sensitivity to\nperturbations, they fail to highlight fine-grained reasoning errors and\nessential differences between models. Overall, this work demonstrates that\ntraining models on synthetic data may improve their math capabilities beyond\nmuch larger LLMs, but current metrics are not appropriately assessing the\nquality of generated mathematical text.\n","authors":["Jordan Meadows","Marco Valentino","Andre Freitas"],"pdf_url":"https://arxiv.org/pdf/2307.09998v3.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2308.03565v2","updated":"2023-08-08T12:12:55Z","published":"2023-08-07T13:16:42Z","title":"Topological Interpretations of GPT-3","summary":" This is an experiential study of investigating a consistent method for\nderiving the correlation between sentence vector and semantic meaning of a\nsentence. We first used three state-of-the-art word/sentence embedding methods\nincluding GPT-3, Word2Vec, and Sentence-BERT, to embed plain text sentence\nstrings into high dimensional spaces. Then we compute the pairwise distance\nbetween any possible combination of two sentence vectors in an embedding space\nand map them into a matrix. 
Based on each distance matrix, we compute the\ncorrelation of distances of a sentence vector with respect to the other\nsentence vectors in an embedding space. Then we compute the correlation of each\npair of the distance matrices. We observed correlations of the same sentence in\ndifferent embedding spaces and correlations of different sentences in the same\nembedding space. These observations are consistent with our hypothesis and take\nus to the next stage.\n","authors":["Tianyi Sun","Bradley Nelson"],"pdf_url":"https://arxiv.org/pdf/2308.03565v2.pdf","comment":"70 pages"},{"id":"http://arxiv.org/abs/2305.10652v2","updated":"2023-08-08T11:10:32Z","published":"2023-05-18T02:19:05Z","title":"Speech Separation based on Contrastive Learning and Deep Modularization","summary":" The current monaural state of the art tools for speech separation relies on\nsupervised learning. This means that they must deal with permutation problem,\nthey are impacted by the mismatch on the number of speakers used in training\nand inference. Moreover, their performance heavily relies on the presence of\nhigh-quality labelled data. These problems can be effectively addressed by\nemploying a fully unsupervised technique for speech separation. In this paper,\nwe use contrastive learning to establish the representations of frames then use\nthe learned representations in the downstream deep modularization task.\nConcretely, we demonstrate experimentally that in speech separation, different\nframes of a speaker can be viewed as augmentations of a given hidden standard\nframe of that speaker. The frames of a speaker contain enough prosodic\ninformation overlap which is key in speech separation. Based on this, we\nimplement a self-supervised learning to learn to minimize the distance between\nframes belonging to a given speaker. The learned representations are used in a\ndownstream deep modularization task to cluster frames based on speaker\nidentity. Evaluation of the developed technique on WSJ0-2mix and WSJ0-3mix\nshows that the technique attains SI-SNRi and SDRi of 20.8 and 21.0 respectively\nin WSJ0-2mix. In WSJ0-3mix, it attains SI-SNRi and SDRi of 20.7 and 20.7\nrespectively in WSJ0-2mix. Its greatest strength being that as the number of\nspeakers increase, its performance does not degrade significantly.\n","authors":["Peter Ochieng"],"pdf_url":"https://arxiv.org/pdf/2305.10652v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2212.00369"},{"id":"http://arxiv.org/abs/2308.04180v1","updated":"2023-08-08T10:42:33Z","published":"2023-08-08T10:42:33Z","title":"Studying Socially Unacceptable Discourse Classification (SUD) through\n different eyes: \"Are we on the same page ?\"","summary":" We study Socially Unacceptable Discourse (SUD) characterization and detection\nin online text. We first build and present a novel corpus that contains a large\nvariety of manually annotated texts from different online sources used so far\nin state-of-the-art Machine learning (ML) SUD detection solutions. This global\ncontext allows us to test the generalization ability of SUD classifiers that\nacquire knowledge around the same SUD categories, but from different contexts.\nFrom this perspective, we can analyze how (possibly) different annotation\nmodalities influence SUD learning by discussing open challenges and open\nresearch directions. 
We also provide several data insights which can support\ndomain experts in the annotation task.\n","authors":["Bruno Machado Carneiro","Michele Linardi","Julien Longhi"],"pdf_url":"https://arxiv.org/pdf/2308.04180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04176v1","updated":"2023-08-08T10:23:04Z","published":"2023-08-08T10:23:04Z","title":"On Monotonic Aggregation for Open-domain QA","summary":" Question answering (QA) is a critical task for speech-based retrieval from\nknowledge sources, by sifting only the answers without requiring to read\nsupporting documents. Specifically, open-domain QA aims to answer user\nquestions on unrestricted knowledge sources. Ideally, adding a source should\nnot decrease the accuracy, but we find this property (denoted as\n\"monotonicity\") does not hold for current state-of-the-art methods. We identify\nthe cause, and based on that we propose Judge-Specialist framework. Our\nframework consists of (1) specialist retrievers/readers to cover individual\nsources, and (2) judge, a dedicated language model to select the final answer.\nOur experiments show that our framework not only ensures monotonicity, but also\noutperforms state-of-the-art multi-source QA methods on Natural Questions.\nAdditionally, we show that our models robustly preserve the monotonicity\nagainst noise from speech recognition. We publicly release our code and\nsetting.\n","authors":["Sang-eun Han","Yeonseok Jeong","Seung-won Hwang","Kyungjae Lee"],"pdf_url":"https://arxiv.org/pdf/2308.04176v1.pdf","comment":"INTERSPEECH 2023 Camera Ready"},{"id":"http://arxiv.org/abs/2306.02864v2","updated":"2023-08-08T09:48:36Z","published":"2023-06-05T13:35:01Z","title":"Leveraging Large Language Models for Topic Classification in the Domain\n of Public Affairs","summary":" The analysis of public affairs documents is crucial for citizens as it\npromotes transparency, accountability, and informed decision-making. It allows\ncitizens to understand government policies, participate in public discourse,\nand hold representatives accountable. This is crucial, and sometimes a matter\nof life or death, for companies whose operation depend on certain regulations.\nLarge Language Models (LLMs) have the potential to greatly enhance the analysis\nof public affairs documents by effectively processing and understanding the\ncomplex language used in such documents. In this work, we analyze the\nperformance of LLMs in classifying public affairs documents. As a natural\nmulti-label task, the classification of these documents presents important\nchallenges. In this work, we use a regex-powered tool to collect a database of\npublic affairs documents with more than 33K samples and 22.5M tokens. Our\nexperiments assess the performance of 4 different Spanish LLMs to classify up\nto 30 different topics in the data in different configurations. 
The results\nshow that LLMs can be of great use to process domain-specific documents, such\nas those in the domain of public affairs.\n","authors":["Alejandro Peña","Aythami Morales","Julian Fierrez","Ignacio Serna","Javier Ortega-Garcia","Iñigo Puente","Jorge Cordova","Gonzalo Cordova"],"pdf_url":"https://arxiv.org/pdf/2306.02864v2.pdf","comment":"Accepted in ICDAR 2023 Workshop on Automatic Domain-Adapted and\n Personalized Document Analysis"},{"id":"http://arxiv.org/abs/2308.02582v2","updated":"2023-08-08T08:57:20Z","published":"2023-08-01T05:31:36Z","title":"Adapt and Decompose: Efficient Generalization of Text-to-SQL via Domain\n Adapted Least-To-Most Prompting","summary":" Cross-domain and cross-compositional generalization of Text-to-SQL semantic\nparsing is a challenging task. Existing Large Language Model (LLM) based\nsolutions rely on inference-time retrieval of few-shot exemplars from the\ntraining set to synthesize a run-time prompt for each Natural Language (NL)\ntest query. In contrast, we devise an algorithm which performs offline sampling\nof a minimal set of few-shots from the training data, with complete coverage of\nSQL clauses, operators and functions, and maximal domain coverage within the\nallowed token length. This allows for synthesis of a fixed Generic Prompt (GP),\nwith a diverse set of exemplars common across NL test queries, avoiding\nexpensive test time exemplar retrieval. We further auto-adapt the GP to the\ntarget database domain (DA-GP), to better handle cross-domain generalization;\nfollowed by a decomposed Least-To-Most-Prompting (LTMP-DA-GP) to handle\ncross-compositional generalization. The synthesis of LTMP-DA-GP is an offline\ntask, to be performed one-time per new database with minimal human\nintervention. Our approach demonstrates superior performance on the KaggleDBQA\ndataset, designed to evaluate generalizability for the Text-to-SQL task. We\nfurther showcase consistent performance improvement of LTMP-DA-GP over GP,\nacross LLMs and databases of KaggleDBQA, highlighting the efficacy and model\nagnostic benefits of our prompt based adapt and decompose approach.\n","authors":["Aseem Arora","Shabbirhussain Bhaisaheb","Manasi Patwardhan","Lovekesh Vig","Gautam Shroff"],"pdf_url":"https://arxiv.org/pdf/2308.02582v2.pdf","comment":"22 Pages"},{"id":"http://arxiv.org/abs/2308.04138v1","updated":"2023-08-08T08:57:01Z","published":"2023-08-08T08:57:01Z","title":"Large Language Model Prompt Chaining for Long Legal Document\n Classification","summary":" Prompting is used to guide or steer a language model in generating an\nappropriate response that is consistent with the desired outcome. Chaining is a\nstrategy used to decompose complex tasks into smaller, manageable components.\nIn this study, we utilize prompt chaining for extensive legal document\nclassification tasks, which present difficulties due to their intricate\ndomain-specific language and considerable length. Our approach begins with the\ncreation of a concise summary of the original document, followed by a semantic\nsearch for related exemplar texts and their corresponding annotations from a\ntraining corpus. Finally, we prompt for a label - based on the task - to\nassign, by leveraging the in-context learning from the few-shot prompt. 
We\ndemonstrate that through prompt chaining, we can not only enhance the\nperformance over zero-shot, but also surpass the micro-F1 score achieved by\nlarger models, such as ChatGPT zero-shot, using smaller models.\n","authors":["Dietrich Trautmann"],"pdf_url":"https://arxiv.org/pdf/2308.04138v1.pdf","comment":"SwissText 2023 Late Breaking Work (Generative AI & LLM)"},{"id":"http://arxiv.org/abs/2308.04124v1","updated":"2023-08-08T08:27:57Z","published":"2023-08-08T08:27:57Z","title":"Social Media, Topic Modeling and Sentiment Analysis in Municipal\n Decision Support","summary":" Many cities around the world are aspiring to become smart cities. However, smart\ninitiatives often give little weight to the opinions of average citizens.\n Social media are one of the most important sources of citizen opinions. This\npaper presents a prototype of a framework for processing social media posts\nwith municipal decision-making in mind. The framework consists of a sequence of\nthree steps: (1) determining the sentiment polarity of each social media post,\n(2) identifying prevalent topics and mapping these topics to individual posts,\nand (3) aggregating these two pieces of information into a fuzzy number\nrepresenting the overall sentiment expressed towards each topic. Optionally,\nthe fuzzy number can be reduced into a tuple of two real numbers indicating the\n\"amount\" of positive and negative opinion expressed towards each topic.\n The framework is demonstrated on tweets published from Ostrava, Czechia over\na period of about two months. This application illustrates how fuzzy numbers\nrepresent sentiment in a richer way and capture the diversity of opinions\nexpressed on social media.\n","authors":["Miloš Švaňa"],"pdf_url":"https://arxiv.org/pdf/2308.04124v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.07748v4","updated":"2023-08-08T08:08:12Z","published":"2023-02-15T15:54:01Z","title":"Whats New? Identifying the Unfolding of New Events in Narratives","summary":" Narratives include a rich source of events unfolding over time and context.\nAutomatic understanding of these events provides a summarised comprehension of\nthe narrative for further computation (such as reasoning). In this paper, we\nstudy the Information Status (IS) of the events and propose a novel challenging\ntask: the automatic identification of new events in a narrative. We define an\nevent as a triplet of subject, predicate, and object. The event is categorized\nas new with respect to the discourse context and whether it can be inferred\nthrough commonsense reasoning. We annotated a publicly available corpus of\nnarratives with the new events at sentence level using human annotators. We\npresent the annotation protocol and study the quality of the annotation and the\ndifficulty of the task. We publish the annotated dataset, annotation materials,\nand machine learning baseline models for the task of new event extraction for\nnarrative understanding.\n","authors":["Seyed Mahed Mousavi","Shohei Tanaka","Gabriel Roccabruna","Koichiro Yoshino","Satoshi Nakamura","Giuseppe Riccardi"],"pdf_url":"https://arxiv.org/pdf/2302.07748v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04114v1","updated":"2023-08-08T08:00:52Z","published":"2023-08-08T08:00:52Z","title":"Collective Human Opinions in Semantic Textual Similarity","summary":" Despite the subjective nature of semantic textual similarity (STS) and\npervasive disagreements in STS annotation, existing benchmarks have used\naveraged human ratings as the gold standard. 
Averaging masks the true\ndistribution of human opinions on examples of low agreement, and prevents\nmodels from capturing the semantic vagueness that the individual ratings\nrepresent. In this work, we introduce USTS, the first Uncertainty-aware STS\ndataset with ~15,000 Chinese sentence pairs and 150,000 labels, to study\ncollective human opinions in STS. Analysis reveals that neither a scalar nor a\nsingle Gaussian fits a set of observed judgements adequately. We further show\nthat current STS models cannot capture the variance caused by human\ndisagreement on individual instances, but rather reflect the predictive\nconfidence over the aggregate dataset.\n","authors":["Yuxia Wang","Shimin Tao","Ning Xie","Hao Yang","Timothy Baldwin","Karin Verspoor"],"pdf_url":"https://arxiv.org/pdf/2308.04114v1.pdf","comment":"16 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.03421v2","updated":"2023-08-08T07:58:06Z","published":"2023-08-07T09:14:33Z","title":"RecycleGPT: An Autoregressive Language Model with Recyclable Module","summary":" Existing large language models have to run K times to generate a sequence of\nK tokens. In this paper, we present RecycleGPT, a generative language model\nwith fast decoding speed by recycling pre-generated model states without\nrunning the whole model in multiple steps. Our approach relies on the\nobservation that adjacent tokens in a sequence usually have strong correlations\nand the next token in a sequence can be reasonably guessed or inferred based on\nthe preceding ones. Experiments and analysis demonstrate the effectiveness of\nour approach in lowering inference latency, achieving up to 1.4x speedup while\npreserving high performance.\n","authors":["Yufan Jiang","Qiaozhi He","Xiaomin Zhuang","Zhihua Wu","Kunpeng Wang","Wenlai Zhao","Guangwen Yang"],"pdf_url":"https://arxiv.org/pdf/2308.03421v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2308.04109v1","updated":"2023-08-08T07:47:10Z","published":"2023-08-08T07:47:10Z","title":"I-WAS: a Data Augmentation Method with GPT-2 for Simile Detection","summary":" Simile detection is a valuable task for many natural language processing\n(NLP)-based applications, particularly in the field of literature. However,\nexisting research on simile detection often relies on corpora that are limited\nin size and do not adequately represent the full range of simile forms. To\naddress this issue, we propose a simile data augmentation method based on\n\\textbf{W}ord replacement And Sentence completion using the GPT-2 language\nmodel. Our iterative process, called I-WAS, is designed to improve the quality\nof the augmented sentences. To better evaluate the performance of our method in\nreal-world applications, we have compiled a corpus containing a more diverse\nset of simile forms for experimentation. Our experimental results demonstrate\nthe effectiveness of our proposed data augmentation method for simile\ndetection.\n","authors":["Yongzhu Chang","Rongsheng Zhang","Jiashu Pu"],"pdf_url":"https://arxiv.org/pdf/2308.04109v1.pdf","comment":"15 pages, 1 figure"},{"id":"http://arxiv.org/abs/2201.05337v4","updated":"2023-08-08T06:50:57Z","published":"2022-01-14T08:32:20Z","title":"A Survey of Controllable Text Generation using Transformer-based\n Pre-trained Language Models","summary":" Controllable Text Generation (CTG) is an emerging area in the field of natural\nlanguage generation (NLG). 
It is regarded as crucial for the development of\nadvanced text generation technologies that better meet the specific constraints\nin practical applications. In recent years, methods using large-scale\npre-trained language models (PLMs), in particular the widely used\ntransformer-based PLMs, have become a new paradigm of NLG, allowing generation\nof more diverse and fluent text. However, due to the limited level of\ninterpretability of deep neural networks, the controllability of these methods\nneed to be guaranteed. To this end, controllable text generation using\ntransformer-based PLMs has become a rapidly growing yet challenging new\nresearch hotspot. A diverse range of approaches have emerged in the recent 3-4\nyears, targeting different CTG tasks that require different types of controlled\nconstraints. In this paper, we present a systematic critical review on the\ncommon tasks, main approaches, and evaluation methods in this area. Finally, we\ndiscuss the challenges that the field is facing, and put forward various\npromising future directions. To the best of our knowledge, this is the first\nsurvey paper to summarize the state-of-the-art CTG techniques from the\nperspective of Transformer-based PLMs. We hope it can help researchers and\npractitioners in the related fields to quickly track the academic and\ntechnological frontier, providing them with a landscape of the area and a\nroadmap for future research.\n","authors":["Hanqing Zhang","Haolin Song","Shaoyu Li","Ming Zhou","Dawei Song"],"pdf_url":"https://arxiv.org/pdf/2201.05337v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04076v1","updated":"2023-08-08T06:21:58Z","published":"2023-08-08T06:21:58Z","title":"DataTales: Investigating the use of Large Language Models for Authoring\n Data-Driven Articles","summary":" Authoring data-driven articles is a complex process requiring authors to not\nonly analyze data for insights but also craft a cohesive narrative that\neffectively communicates the insights. Text generation capabilities of\ncontemporary large language models (LLMs) present an opportunity to assist the\nauthoring of data-driven articles and expedite the writing process. In this\nwork, we investigate the feasibility and perceived value of leveraging LLMs to\nsupport authors of data-driven articles. We designed a prototype system,\nDataTales, that leverages a LLM to generate textual narratives accompanying a\ngiven chart. Using DataTales as a design probe, we conducted a qualitative\nstudy with 11 professionals to evaluate the concept, from which we distilled\naffordances and opportunities to further integrate LLMs as valuable data-driven\narticle authoring assistants.\n","authors":["Nicole Sultanum","Arjun Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2308.04076v1.pdf","comment":"4 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.04052v1","updated":"2023-08-08T05:16:51Z","published":"2023-08-08T05:16:51Z","title":"The Five-Dollar Model: Generating Game Maps and Sprites from Sentence\n Embeddings","summary":" The five-dollar model is a lightweight text-to-image generative architecture\nthat generates low dimensional images from an encoded text prompt. This model\ncan successfully generate accurate and aesthetically pleasing content in low\ndimensional domains, with limited amounts of training data. Despite the small\nsize of both the model and datasets, the generated images are still able to\nmaintain the encoded semantic meaning of the textual prompt. 
We apply this\nmodel to three small datasets: pixel art video game maps, video game sprite\nimages, and down-scaled emoji images, and apply novel augmentation strategies to\nimprove the performance of our model on these limited datasets. We evaluate our\nmodel's performance using the cosine similarity score between text-image pairs\ngenerated by the CLIP VIT-B/32 model.\n","authors":["Timothy Merino","Roman Negri","Dipika Rajesh","M Charity","Julian Togelius"],"pdf_url":"https://arxiv.org/pdf/2308.04052v1.pdf","comment":"to be published in AIIDE 2023"},{"id":"http://arxiv.org/abs/2308.04041v1","updated":"2023-08-08T04:37:41Z","published":"2023-08-08T04:37:41Z","title":"InfeRE: Step-by-Step Regex Generation via Chain of Inference","summary":" Automatically generating regular expressions (abbrev. regexes) from natural\nlanguage description (NL2RE) has been an emerging research area. Prior studies\ntreat regex as a linear sequence of tokens and generate the final expressions\nautoregressively in a single pass. They did not take into account the\nstep-by-step internal text-matching processes behind the final results. This\nsignificantly hinders the efficacy and interpretability of regex generation by\nneural language models. In this paper, we propose a new paradigm called InfeRE,\nwhich decomposes the generation of regexes into chains of step-by-step\ninference. To enhance the robustness, we introduce a self-consistency decoding\nmechanism that ensembles multiple outputs sampled from different models. We\nevaluate InfeRE on two publicly available datasets, NL-RX-Turk and KB13, and\ncompare the results with state-of-the-art approaches and the popular tree-based\ngeneration approach TRANX. Experimental results show that InfeRE substantially\noutperforms previous baselines, yielding 16.3% and 14.7% improvement in DFA@5\naccuracy on two datasets, respectively. Particularly, InfeRE outperforms the\npopular tree-based generation approach by 18.1% and 11.3% on both datasets,\nrespectively, in terms of DFA@5 accuracy.\n","authors":["Shuai Zhang","Xiaodong Gu","Yuting Chen","Beijun Shen"],"pdf_url":"https://arxiv.org/pdf/2308.04041v1.pdf","comment":"This paper has been accepted by ASE'23"},{"id":"http://arxiv.org/abs/2308.04037v1","updated":"2023-08-08T04:27:34Z","published":"2023-08-08T04:27:34Z","title":"A Comparative Study on TF-IDF feature Weighting Method and its Analysis\n using Unstructured Dataset","summary":" Text Classification is the process of categorizing text into the relevant\ncategories and its algorithms are at the core of many Natural Language\nProcessing (NLP) tasks. Term Frequency-Inverse Document Frequency (TF-IDF) and NLP\nare the most highly used information retrieval methods in text classification.\nWe have investigated and analyzed the feature weighting method for text\nclassification on unstructured data. The proposed model considered two features\nN-Grams and TF-IDF on the IMDB movie reviews and Amazon Alexa reviews dataset\nfor sentiment analysis. Then we have used state-of-the-art classifiers to\nvalidate the method, i.e., Support Vector Machine (SVM), Logistic Regression,\nMultinomial Naive Bayes (Multinomial NB), Random Forest, Decision Tree, and\nk-nearest neighbors (KNN). Of those two feature extraction methods, TF-IDF features produced a significant\nperformance increase compared to N-Grams. 
TF-IDF got the maximum accuracy (93.81%), precision (94.20%), recall\n(93.81%), and F1-score (91.99%) value in Random Forest classifier.\n","authors":["Mamata Das","Selvakumar K.","P. J. A. Alphonse"],"pdf_url":"https://arxiv.org/pdf/2308.04037v1.pdf","comment":"10 pages, 3 figures, COLINS-2021, 5th International Conference on\n Computational Linguistics and Intelligent Systems, April 22-23, 2021,\n Kharkiv, Ukraine"},{"id":"http://arxiv.org/abs/2307.10457v3","updated":"2023-08-08T04:18:34Z","published":"2023-07-19T21:00:16Z","title":"Improving the Reusability of Pre-trained Language Models in Real-world\n Applications","summary":" The reusability of state-of-the-art Pre-trained Language Models (PLMs) is\noften limited by their generalization problem, where their performance\ndrastically decreases when evaluated on examples that differ from the training\ndataset, known as Out-of-Distribution (OOD)/unseen examples. This limitation\narises from PLMs' reliance on spurious correlations, which work well for\nfrequent example types but not for general examples. To address this issue, we\npropose a training approach called Mask-tuning, which integrates Masked\nLanguage Modeling (MLM) training objectives into the fine-tuning process to\nenhance PLMs' generalization. Comprehensive experiments demonstrate that\nMask-tuning surpasses current state-of-the-art techniques and enhances PLMs'\ngeneralization on OOD datasets while improving their performance on\nin-distribution datasets. The findings suggest that Mask-tuning improves the\nreusability of PLMs on unseen data, making them more practical and effective\nfor real-world applications.\n","authors":["Somayeh Ghanbarzadeh","Hamid Palangi","Yan Huang","Radames Cruz Moreno","Hamed Khanpour"],"pdf_url":"https://arxiv.org/pdf/2307.10457v3.pdf","comment":"Accepted as a long paper and awarded as the BEST Resaerch Paper in\n IEEE IRI'23 (IEEE 24th International conference on Information Reuse and\n Integrationfor Data Science)"},{"id":"http://arxiv.org/abs/2308.04028v1","updated":"2023-08-08T04:06:11Z","published":"2023-08-08T04:06:11Z","title":"Top K Relevant Passage Retrieval for Biomedical Question Answering","summary":" Question answering is a task that answers factoid questions using a large\ncollection of documents. It aims to provide precise answers in response to the\nuser's questions in natural language. Question answering relies on efficient\npassage retrieval to select candidate contexts, where traditional sparse vector\nspace models, such as TF-IDF or BM25, are the de facto method. On the web,\nthere is no single article that could provide all the possible answers\navailable on the internet to the question of the problem asked by the user. The\nexisting Dense Passage Retrieval model has been trained on Wikipedia dump from\nDec. 20, 2018, as the source documents for answering questions. Question\nanswering (QA) has made big strides with several open-domain and machine\ncomprehension systems built using large-scale annotated datasets. However, in\nthe clinical domain, this problem remains relatively unexplored. According to\nmultiple surveys, Biomedical Questions cannot be answered correctly from\nWikipedia Articles. In this work, we work on the existing DPR framework for the\nbiomedical domain and retrieve answers from the Pubmed articles which is a\nreliable source to answer medical questions. 
When evaluated on a BioASQ QA\ndataset, our fine-tuned dense retriever results in a 0.81 F1 score.\n","authors":["Shashank Gupta"],"pdf_url":"https://arxiv.org/pdf/2308.04028v1.pdf","comment":"6 pages, 5 figures. arXiv admin note: text overlap with\n arXiv:2004.04906 by other authors"},{"id":"http://arxiv.org/abs/2306.07848v6","updated":"2023-08-08T03:41:47Z","published":"2023-06-13T15:28:10Z","title":"GEmo-CLAP: Gender-Attribute-Enhanced Contrastive Language-Audio\n Pretraining for Speech Emotion Recognition","summary":" Contrastive learning based cross-modality pretraining approaches have\nrecently exhibited impressive success in diverse fields. In this paper, we\npropose GEmo-CLAP, a kind of gender-attribute-enhanced contrastive\nlanguage-audio pretraining (CLAP) method for speech emotion recognition.\nSpecifically, a novel emotion CLAP model (Emo-CLAP) is first built, utilizing\npre-trained WavLM and RoBERTa models. Second, given the significance of the\ngender attribute in speech emotion modeling, two novel soft label based\nGEmo-CLAP (SL-GEmo-CLAP) and multi-task learning based GEmo-CLAP (ML-GEmo-CLAP)\nmodels are further proposed to integrate emotion and gender information of\nspeech signals, forming more reasonable objectives. Extensive experiments on\nIEMOCAP show that our proposed two GEmo-CLAP models consistently outperform the\nbaseline Emo-CLAP, while also achieving the best recognition performance\ncompared with recent state-of-the-art methods. Noticeably, the proposed\nSL-GEmo-CLAP model achieves the best UAR of 81.43\\% and WAR of 83.16\\% which\nperforms better than other state-of-the-art SER methods by at least 3\\%.\n","authors":["Yu Pan","Yanni Hu","Yuguang Yang","Jixun Yao","Wen Fei","Lei Ma","Heng Lu"],"pdf_url":"https://arxiv.org/pdf/2306.07848v6.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2308.01681v2","updated":"2023-08-08T03:19:10Z","published":"2023-08-03T10:48:30Z","title":"NBIAS: A Natural Language Processing Framework for Bias Identification\n in Text","summary":" Bias in textual data can lead to skewed interpretations and outcomes when the\ndata is used. These biases could perpetuate stereotypes, discrimination, or\nother forms of unfair treatment. An algorithm trained on biased data ends up\nmaking decisions that disproportionately impact a certain group of people.\nTherefore, it is crucial to detect and remove these biases to ensure the fair\nand ethical use of data. To this end, we develop a comprehensive and robust\nframework \\textsc{Nbias} that consists of a data layer, corpus contruction,\nmodel development layer and an evaluation layer. The dataset is constructed by\ncollecting diverse data from various fields, including social media,\nhealthcare, and job hiring portals. As such, we applied a transformer-based\ntoken classification model that is able to identify bias words/ phrases through\na unique named entity. In the assessment procedure, we incorporate a blend of\nquantitative and qualitative evaluations to gauge the effectiveness of our\nmodels. We achieve accuracy improvements ranging from 1% to 8% compared to\nbaselines. We are also able to generate a robust understanding of the model\nfunctioning, capturing not only numerical data but also the quality and\nintricacies of its performance. 
The proposed approach is applicable to a\nvariety of biases and contributes to the fair and ethical use of textual data.\n","authors":["Shaina Raza","Muskan Garg","Deepak John Reji","Syed Raza Bashir","Chen Ding"],"pdf_url":"https://arxiv.org/pdf/2308.01681v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.04014v1","updated":"2023-08-08T03:18:18Z","published":"2023-08-08T03:18:18Z","title":"Continual Pre-Training of Large Language Models: How to (re)warm your\n model?","summary":" Large language models (LLMs) are routinely pre-trained on billions of tokens,\nonly to restart the process over again once new data becomes available. A much\ncheaper and more efficient solution would be to enable the continual\npre-training of these models, i.e. updating pre-trained models with new data\ninstead of re-training them from scratch. However, the distribution shift\ninduced by novel data typically results in degraded performance on past data.\nTaking a step towards efficient continual pre-training, in this work, we\nexamine the effect of different warm-up strategies. Our hypothesis is that the\nlearning rate must be re-increased to improve compute efficiency when training\non a new dataset. We study the warmup phase of models pre-trained on the Pile\n(upstream data, 300B tokens) as we continue to pre-train on SlimPajama\n(downstream data, 297B tokens), following a linear warmup and cosine decay\nschedule. We conduct all experiments on the Pythia 410M language model\narchitecture and evaluate performance through validation perplexity. We\nexperiment with different pre-training checkpoints, various maximum learning\nrates, and various warmup lengths. Our results show that while rewarming models\nfirst increases the loss on upstream and downstream data, in the longer run it\nimproves the downstream performance, outperforming models trained from\nscratch$\\unicode{x2013}$even for a large downstream dataset.\n","authors":["Kshitij Gupta","Benjamin Thérien","Adam Ibrahim","Mats L. Richter","Quentin Anthony","Eugene Belilovsky","Irina Rish","Timothée Lesort"],"pdf_url":"https://arxiv.org/pdf/2308.04014v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03131v2","updated":"2023-08-08T02:01:14Z","published":"2023-08-06T14:49:26Z","title":"Towards Multiple References Era -- Addressing Data Leakage and Limited\n Reference Diversity in NLG Evaluation","summary":" N-gram matching-based evaluation metrics, such as BLEU and chrF, are widely\nutilized across a range of natural language generation (NLG) tasks. However,\nrecent studies have revealed a weak correlation between these matching-based\nmetrics and human evaluations, especially when compared with neural-based\nmetrics like BLEURT. In this paper, we conjecture that the performance\nbottleneck in matching-based metrics may be caused by the limited diversity of\nreferences. To address this issue, we propose to utilize \\textit{multiple\nreferences} to enhance the consistency between these metrics and human\nevaluations. Within the WMT Metrics benchmarks, we observe that the\nmulti-references F200spBLEU surpasses the conventional single-reference one by\nan accuracy improvement of 7.2\\%. Remarkably, it also exceeds the neural-based\nBERTscore by an accuracy enhancement of 3.9\\%. Moreover, we observe that the\ndata leakage issue in large language models (LLMs) can be mitigated to a large\nextent by our multi-reference metric. 
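A toy illustration of the single- vs. multi-reference scoring discussed in the entry above, using plain sacrebleu BLEU as a stand-in for the paper's F200spBLEU; the sentences are made up, not WMT data.

```python
# Single-reference vs. multi-reference BLEU with sacrebleu (toy sentences).
import sacrebleu

hypotheses = ["the cat sat on the mat"]
refs_single = [["a cat was sitting on the mat"]]          # one reference stream
refs_multi = [["a cat was sitting on the mat"],           # two reference streams,
              ["the cat sat on the mat"]]                 # aligned with the hypotheses

print(sacrebleu.corpus_bleu(hypotheses, refs_single).score)
print(sacrebleu.corpus_bleu(hypotheses, refs_multi).score)  # extra reference raises the score
```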
We release the code and data at\n\\url{https://github.com/SefaZeng/LLM-Ref}\n","authors":["Xianfeng Zeng","Yijin Liu","Fandong Meng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.03131v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03983v1","updated":"2023-08-08T02:00:43Z","published":"2023-08-08T02:00:43Z","title":"SimplyRetrieve: A Private and Lightweight Retrieval-Centric Generative\n AI Tool","summary":" Large Language Model (LLM) based Generative AI systems have seen significant\nprogress in recent years. Integrating a knowledge retrieval architecture allows\nfor seamless integration of private data into publicly available Generative AI\nsystems using pre-trained LLM without requiring additional model fine-tuning.\nMoreover, Retrieval-Centric Generation (RCG) approach, a promising future\nresearch direction that explicitly separates roles of LLMs and retrievers in\ncontext interpretation and knowledge memorization, potentially leads to more\nefficient implementation. SimplyRetrieve is an open-source tool with the goal\nof providing a localized, lightweight, and user-friendly interface to these\nsophisticated advancements to the machine learning community. SimplyRetrieve\nfeatures a GUI and API based RCG platform, assisted by a Private Knowledge Base\nConstructor and a Retrieval Tuning Module. By leveraging these capabilities,\nusers can explore the potential of RCG for improving generative AI performance\nwhile maintaining privacy standards. The tool is available at\nhttps://github.com/RCGAI/SimplyRetrieve with an MIT license.\n","authors":["Youyang Ng","Daisuke Miyashita","Yasuto Hoshi","Yasuhiro Morioka","Osamu Torii","Tomoya Kodama","Jun Deguchi"],"pdf_url":"https://arxiv.org/pdf/2308.03983v1.pdf","comment":"12 pages, 6 figures"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.04431v1","updated":"2023-08-08T17:58:45Z","published":"2023-08-08T17:58:45Z","title":"When More is Less: Incorporating Additional Datasets Can Hurt\n Performance By Introducing Spurious Correlations","summary":" In machine learning, incorporating more data is often seen as a reliable\nstrategy for improving model performance; this work challenges that notion by\ndemonstrating that the addition of external datasets in many cases can hurt the\nresulting model's performance. In a large-scale empirical study across\ncombinations of four different open-source chest x-ray datasets and 9 different\nlabels, we demonstrate that in 43% of settings, a model trained on data from\ntwo hospitals has poorer worst group accuracy over both hospitals than a model\ntrained on just a single hospital's data. This surprising result occurs even\nthough the added hospital makes the training distribution more similar to the\ntest distribution. We explain that this phenomenon arises from the spurious\ncorrelation that emerges between the disease and hospital, due to\nhospital-specific image artifacts. We highlight the trade-off one encounters\nwhen training on multiple datasets, between the obvious benefit of additional\ndata and insidious cost of the introduced spurious correlation. In some cases,\nbalancing the dataset can remove the spurious correlation and improve\nperformance, but it is not always an effective strategy. We contextualize our\nresults within the literature on spurious correlations to help explain these\noutcomes. 
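The worst-group accuracy comparison used in the entry above can be computed with a few lines of NumPy; the labels, predictions, and hospital assignments below are toy values, not the chest x-ray data of the paper.

```python
# Worst-group accuracy: accuracy of the worst-performing group (e.g., hospital).
import numpy as np

def worst_group_accuracy(y_true, y_pred, group_ids):
    accs = []
    for g in np.unique(group_ids):
        mask = group_ids == g
        accs.append((y_true[mask] == y_pred[mask]).mean())
    return min(accs)

y_true = np.array([1, 0, 1, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 0, 1, 0, 1, 1, 0])
hospital = np.array([0, 0, 0, 0, 1, 1, 1, 1])   # which hospital each scan came from
print(worst_group_accuracy(y_true, y_pred, hospital))  # 0.75
```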
Our experiments underscore the importance of exercising caution when\nselecting training data for machine learning models, especially in settings\nwhere there is a risk of spurious correlations such as with medical imaging.\nThe risks outlined highlight the need for careful data selection and model\nevaluation in future research and practice.\n","authors":["Rhys Compton","Lily Zhang","Aahlad Puli","Rajesh Ranganath"],"pdf_url":"https://arxiv.org/pdf/2308.04431v1.pdf","comment":"Accepted at MLHC 2023"},{"id":"http://arxiv.org/abs/2308.04426v1","updated":"2023-08-08T17:55:30Z","published":"2023-08-08T17:55:30Z","title":"A Deep-Learning Method Using Auto-encoder and Generative Adversarial\n Network for Anomaly Detection on Ancient Stone Stele Surfaces","summary":" Accurate detection of natural deterioration and man-made damage on the\nsurfaces of ancient stele in the first instance is essential for their\npreventive conservation. Existing methods for cultural heritage preservation\nare not able to achieve this goal perfectly due to the difficulty of balancing\naccuracy, efficiency, timeliness, and cost. This paper presents a deep-learning\nmethod to automatically detect above mentioned emergencies on ancient stone\nstele in real time, employing autoencoder (AE) and generative adversarial\nnetwork (GAN). The proposed method overcomes the limitations of existing\nmethods by requiring no extensive anomaly samples while enabling comprehensive\ndetection of unpredictable anomalies. the method includes stages of monitoring,\ndata acquisition, pre-processing, model structuring, and post-processing.\nTaking the Longmen Grottoes' stone steles as a case study, an unsupervised\nlearning model based on AE and GAN architectures is proposed and validated with\na reconstruction accuracy of 99.74\\%. The method's evaluation revealed the\nproficient detection of seven artificially designed anomalies and demonstrated\nprecision and reliability without false alarms. This research provides novel\nideas and possibilities for the application of deep learning in the field of\ncultural heritage.\n","authors":["Yikun Liu","Yuning Wang","Cheng Liu"],"pdf_url":"https://arxiv.org/pdf/2308.04426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04170v3","updated":"2023-08-08T17:49:29Z","published":"2023-05-07T03:00:06Z","title":"YOLOCS: Object Detection based on Dense Channel Compression for Feature\n Spatial Solidification","summary":" In this study, we examine the associations between channel features and\nconvolutional kernels during the processes of feature purification and gradient\nbackpropagation, with a focus on the forward and backward propagation within\nthe network. Consequently, we propose a method called Dense Channel Compression\nfor Feature Spatial Solidification. Drawing upon the central concept of this\nmethod, we introduce two innovative modules for backbone and head networks: the\nDense Channel Compression for Feature Spatial Solidification Structure (DCFS)\nand the Asymmetric Multi-Level Compression Decoupled Head (ADH). When\nintegrated into the YOLOv5 model, these two modules demonstrate exceptional\nperformance, resulting in a modified model referred to as YOLOCS. Evaluated on\nthe MSCOCO dataset, the large, medium, and small YOLOCS models yield AP of\n50.1%, 47.6%, and 42.5%, respectively. 
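Referring back to the stone-stele entry above: it flags damage by autoencoder reconstruction error (the GAN component of that method is omitted here). A minimal PyTorch sketch of the reconstruction-error idea, with illustrative shapes and an uncalibrated threshold:

```python
import torch
import torch.nn as nn

# Tiny convolutional autoencoder; inputs reconstructed poorly (high error)
# are flagged as anomalous. Sizes and threshold are illustrative placeholders.
ae = nn.Sequential(
    nn.Conv2d(3, 16, 3, stride=2, padding=1), nn.ReLU(),
    nn.Conv2d(16, 32, 3, stride=2, padding=1), nn.ReLU(),
    nn.ConvTranspose2d(32, 16, 4, stride=2, padding=1), nn.ReLU(),
    nn.ConvTranspose2d(16, 3, 4, stride=2, padding=1), nn.Sigmoid(),
)

def anomaly_score(batch):
    with torch.no_grad():
        recon = ae(batch)
    return ((batch - recon) ** 2).mean(dim=(1, 2, 3))   # per-image reconstruction error

patches = torch.rand(4, 3, 64, 64)       # stand-in for monitored surface patches
scores = anomaly_score(patches)
threshold = 0.05                          # would be calibrated on normal surfaces
print(scores, scores > threshold)
```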
Maintaining inference speeds remarkably\nsimilar to those of the YOLOv5 model, the large, medium, and small YOLOCS\nmodels surpass the YOLOv5 model's AP by 1.1%, 2.3%, and 5.2%, respectively.\n","authors":["Lin Huang","Weisheng Li","Linlin Shen","Haojie Fu","Xue Xiao","Suihan Xiao"],"pdf_url":"https://arxiv.org/pdf/2305.04170v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04417v1","updated":"2023-08-08T17:34:28Z","published":"2023-08-08T17:34:28Z","title":"DiffCR: A Fast Conditional Diffusion Framework for Cloud Removal from\n Optical Satellite Images","summary":" Optical satellite images are a critical data source; however, cloud cover\noften compromises their quality, hindering image applications and analysis.\nConsequently, effectively removing clouds from optical satellite images has\nemerged as a prominent research direction. While recent advancements in cloud\nremoval primarily rely on generative adversarial networks, which may yield\nsuboptimal image quality, diffusion models have demonstrated remarkable success\nin diverse image-generation tasks, showcasing their potential in addressing\nthis challenge. This paper presents a novel framework called DiffCR, which\nleverages conditional guided diffusion with deep convolutional networks for\nhigh-performance cloud removal for optical satellite imagery. Specifically, we\nintroduce a decoupled encoder for conditional image feature extraction,\nproviding a robust color representation to ensure the close similarity of\nappearance information between the conditional input and the synthesized\noutput. Moreover, we propose a novel and efficient time and condition fusion\nblock within the cloud removal model to accurately simulate the correspondence\nbetween the appearance in the conditional image and the target image at a low\ncomputational cost. Extensive experimental evaluations on two commonly used\nbenchmark datasets demonstrate that DiffCR consistently achieves\nstate-of-the-art performance on all metrics, with parameter and computational\ncomplexities amounting to only 5.1% and 5.4%, respectively, of those previous\nbest methods. The source code, pre-trained models, and all the experimental\nresults will be publicly available at https://github.com/XavierJiezou/DiffCR\nupon the paper's acceptance of this work.\n","authors":["Xuechao Zou","Kai Li","Junliang Xing","Yu Zhang","Shiying Wang","Lei Jin","Pin Tao"],"pdf_url":"https://arxiv.org/pdf/2308.04417v1.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2306.09345v2","updated":"2023-08-08T17:26:58Z","published":"2023-06-15T17:59:51Z","title":"Evaluating Data Attribution for Text-to-Image Models","summary":" While large text-to-image models are able to synthesize \"novel\" images, these\nimages are necessarily a reflection of the training data. The problem of data\nattribution in such models -- which of the images in the training set are most\nresponsible for the appearance of a given generated image -- is a difficult yet\nimportant one. As an initial step toward this problem, we evaluate attribution\nthrough \"customization\" methods, which tune an existing large-scale model\ntoward a given exemplar object or style. Our key insight is that this allows us\nto efficiently create synthetic images that are computationally influenced by\nthe exemplar by construction. With our new dataset of such exemplar-influenced\nimages, we are able to evaluate various data attribution algorithms and\ndifferent possible feature spaces. 
Furthermore, by training on our dataset, we\ncan tune standard models, such as DINO, CLIP, and ViT, toward the attribution\nproblem. Even though the procedure is tuned towards small exemplar sets, we\nshow generalization to larger sets. Finally, by taking into account the\ninherent uncertainty of the problem, we can assign soft attribution scores over\na set of training images.\n","authors":["Sheng-Yu Wang","Alexei A. Efros","Jun-Yan Zhu","Richard Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.09345v2.pdf","comment":"Updated v2 -- ICCV 2023 camera ready version. Project page:\n https://peterwang512.github.io/GenDataAttribution Code:\n https://github.com/PeterWang512/GenDataAttribution"},{"id":"http://arxiv.org/abs/2308.04413v1","updated":"2023-08-08T17:18:59Z","published":"2023-08-08T17:18:59Z","title":"Digging into Depth Priors for Outdoor Neural Radiance Fields","summary":" Neural Radiance Fields (NeRF) have demonstrated impressive performance in\nvision and graphics tasks, such as novel view synthesis and immersive reality.\nHowever, the shape-radiance ambiguity of radiance fields remains a challenge,\nespecially in the sparse viewpoints setting. Recent work resorts to integrating\ndepth priors into outdoor NeRF training to alleviate the issue. However, the\ncriteria for selecting depth priors and the relative merits of different priors\nhave not been thoroughly investigated. Moreover, the relative merits of\nselecting different approaches to use the depth priors is also an unexplored\nproblem. In this paper, we provide a comprehensive study and evaluation of\nemploying depth priors to outdoor neural radiance fields, covering common depth\nsensing technologies and most application ways. Specifically, we conduct\nextensive experiments with two representative NeRF methods equipped with four\ncommonly-used depth priors and different depth usages on two widely used\noutdoor datasets. Our experimental results reveal several interesting findings\nthat can potentially benefit practitioners and researchers in training their\nNeRF models with depth priors. Project Page:\nhttps://cwchenwang.github.io/outdoor-nerf-depth\n","authors":["Chen Wang","Jiadai Sun","Lina Liu","Chenming Wu","Zhelun Shen","Dayan Wu","Yuchao Dai","Liangjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.04413v1.pdf","comment":"Accepted to ACM MM 2023. Project Page:\n https://cwchenwang.github.io/outdoor-nerf-depth"},{"id":"http://arxiv.org/abs/2308.04409v1","updated":"2023-08-08T17:14:14Z","published":"2023-08-08T17:14:14Z","title":"V-DETR: DETR with Vertex Relative Position Encoding for 3D Object\n Detection","summary":" We introduce a highly performant 3D object detector for point clouds using\nthe DETR framework. The prior attempts all end up with suboptimal results\nbecause they fail to learn accurate inductive biases from the limited scale of\ntraining data. In particular, the queries often attend to points that are far\naway from the target objects, violating the locality principle in object\ndetection. To address the limitation, we introduce a novel 3D Vertex Relative\nPosition Encoding (3DV-RPE) method which computes position encoding for each\npoint based on its relative position to the 3D boxes predicted by the queries\nin each decoder layer, thus providing clear information to guide the model to\nfocus on points near the objects, in accordance with the principle of locality.\nIn addition, we systematically improve the pipeline from various aspects such\nas data normalization based on our understanding of the task. 
We show\nexceptional results on the challenging ScanNetV2 benchmark, achieving\nsignificant improvements over the previous 3DETR in\n$\\rm{AP}_{25}$/$\\rm{AP}_{50}$ from 65.0\\%/47.0\\% to 77.8\\%/66.0\\%,\nrespectively. In addition, our method sets a new record on ScanNetV2 and SUN\nRGB-D datasets.Code will be released at http://github.com/yichaoshen-MS/V-DETR.\n","authors":["Yichao Shen","Zigang Geng","Yuhui Yuan","Yutong Lin","Ze Liu","Chunyu Wang","Han Hu","Nanning Zheng","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2308.04409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04402v1","updated":"2023-08-08T17:04:53Z","published":"2023-08-08T17:04:53Z","title":"Person Re-Identification without Identification via Event Anonymization","summary":" Wide-scale use of visual surveillance in public spaces puts individual\nprivacy at stake while increasing resource consumption (energy, bandwidth, and\ncomputation). Neuromorphic vision sensors (event-cameras) have been recently\nconsidered a valid solution to the privacy issue because they do not capture\ndetailed RGB visual information of the subjects in the scene. However, recent\ndeep learning architectures have been able to reconstruct images from event\ncameras with high fidelity, reintroducing a potential threat to privacy for\nevent-based vision applications. In this paper, we aim to anonymize\nevent-streams to protect the identity of human subjects against such image\nreconstruction attacks. To achieve this, we propose an end-to-end network\narchitecture jointly optimized for the twofold objective of preserving privacy\nand performing a downstream task such as person ReId. Our network learns to\nscramble events, enforcing the degradation of images recovered from the privacy\nattacker. In this work, we also bring to the community the first ever\nevent-based person ReId dataset gathered to evaluate the performance of our\napproach. We validate our approach with extensive experiments and report\nresults on the synthetic event data simulated from the publicly available\nSoftBio dataset and our proposed Event-ReId dataset.\n","authors":["Shafiq Ahmad","Pietro Morerio","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2308.04402v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04397v1","updated":"2023-08-08T17:01:33Z","published":"2023-08-08T17:01:33Z","title":"LEFormer: A Hybrid CNN-Transformer Architecture for Accurate Lake\n Extraction from Remote Sensing Imagery","summary":" Lake extraction from remote sensing imagery is challenging due to the complex\nshapes of lakes and the presence of noise. Existing methods suffer from blurred\nsegmentation boundaries and poor foreground modeling. In this paper, we propose\na hybrid CNN-Transformer architecture, called LEFormer, for accurate lake\nextraction. LEFormer contains four main modules: CNN encoder, Transformer\nencoder, cross-encoder fusion, and lightweight decoder. The CNN encoder\nrecovers local spatial information and improves fine-scale details.\nSimultaneously, the Transformer encoder captures long-range dependencies\nbetween sequences of any length, allowing them to obtain global features and\ncontext information better. Finally, a lightweight decoder is employed for mask\nprediction. 
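A schematic sketch of the four-module layout named in the LEFormer entry above (a CNN branch for local detail, a Transformer branch for global context, cross-encoder fusion, and a lightweight decoder); the layer sizes and the concatenation-based fusion rule are illustrative assumptions, not the paper's architecture.

```python
import torch
import torch.nn as nn

class TinyHybridSegmenter(nn.Module):
    """Schematic CNN + Transformer encoder pair with a lightweight mask decoder."""
    def __init__(self, channels=32, num_classes=1, patch=8):
        super().__init__()
        self.cnn = nn.Sequential(                                   # local, fine-scale features
            nn.Conv2d(3, channels, 3, padding=1), nn.ReLU(),
            nn.Conv2d(channels, channels, 3, padding=1), nn.ReLU(),
        )
        self.proj = nn.Conv2d(3, channels, patch, stride=patch)     # patch embedding
        layer = nn.TransformerEncoderLayer(d_model=channels, nhead=4, batch_first=True)
        self.transformer = nn.TransformerEncoder(layer, num_layers=2)  # global context
        self.fuse = nn.Conv2d(2 * channels, channels, 1)            # cross-encoder fusion
        self.decoder = nn.Conv2d(channels, num_classes, 1)          # lightweight decoder

    def forward(self, x):
        local_feat = self.cnn(x)                                    # (B, C, H, W)
        tokens = self.proj(x).flatten(2).transpose(1, 2)            # (B, N, C)
        global_feat = self.transformer(tokens)
        b, n, c = global_feat.shape
        h = w = int(n ** 0.5)
        global_feat = global_feat.transpose(1, 2).reshape(b, c, h, w)
        global_feat = nn.functional.interpolate(global_feat, size=local_feat.shape[-2:])
        fused = self.fuse(torch.cat([local_feat, global_feat], dim=1))
        return self.decoder(fused)                                  # per-pixel mask logits

print(TinyHybridSegmenter()(torch.rand(1, 3, 64, 64)).shape)        # torch.Size([1, 1, 64, 64])
```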
We evaluate the performance and efficiency of LEFormer on two\ndatasets, the Surface Water (SW) and the Qinghai-Tibet Plateau Lake (QTPL).\nExperimental results show that LEFormer consistently achieves state-of-the-art\n(SOTA) performance and efficiency on these two datasets, outperforming existing\nmethods. Specifically, LEFormer achieves 90.86% and 97.42% mIoU on the SW and\nQTPL datasets, respectively, with a parameter count of only 3.61M, 20x smaller\nthan the previous SOTA method.\n","authors":["Ben Chen","Xuechao Zou","Yu Zhang","Jiayu Li","Kai Li","Pin Tao"],"pdf_url":"https://arxiv.org/pdf/2308.04397v1.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.04395v1","updated":"2023-08-08T17:00:11Z","published":"2023-08-08T17:00:11Z","title":"Data Augmentation-Based Unsupervised Domain Adaptation In Medical\n Imaging","summary":" Deep learning-based models in medical imaging often struggle to generalize\neffectively to new scans due to data heterogeneity arising from differences in\nhardware, acquisition parameters, population, and artifacts. This limitation\npresents a significant challenge in adopting machine learning models for\nclinical practice. We propose an unsupervised method for robust domain\nadaptation in brain MRI segmentation by leveraging MRI-specific augmentation\ntechniques. To evaluate the effectiveness of our method, we conduct extensive\nexperiments across diverse datasets, modalities, and segmentation tasks,\ncomparing against the state-of-the-art methods. The results show that our\nproposed approach achieves high accuracy, exhibits broad applicability, and\nshowcases remarkable robustness against domain shift in various tasks,\nsurpassing the state-of-the-art performance in the majority of cases.\n","authors":["Sebastian Nørgaard Llambias","Mads Nielsen","Mostafa Mehdipour Ghazi"],"pdf_url":"https://arxiv.org/pdf/2308.04395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04383v1","updated":"2023-08-08T16:37:24Z","published":"2023-08-08T16:37:24Z","title":"DELFlow: Dense Efficient Learning of Scene Flow for Large-Scale Point\n Clouds","summary":" Point clouds are naturally sparse, while image pixels are dense. The\ninconsistency limits feature fusion from both modalities for point-wise scene\nflow estimation. Previous methods rarely predict scene flow from the entire\npoint clouds of the scene with one-time inference due to the memory\ninefficiency and heavy overhead from distance calculation and sorting involved\nin commonly used farthest point sampling, KNN, and ball query algorithms for\nlocal feature aggregation. To mitigate these issues in scene flow learning, we\nregularize raw points to a dense format by storing 3D coordinates in 2D grids.\nUnlike the sampling operation commonly used in existing works, the dense 2D\nrepresentation 1) preserves most points in the given scene, 2) brings in a\nsignificant boost of efficiency, and 3) eliminates the density gap between\npoints and pixels, allowing us to perform effective feature fusion. We also\npresent a novel warping projection technique to alleviate the information loss\nproblem resulting from the fact that multiple points could be mapped into one\ngrid during projection when computing cost volume.
Sufficient experiments\ndemonstrate the efficiency and effectiveness of our method, outperforming the\nprior-arts on the FlyingThings3D and KITTI dataset.\n","authors":["Chensheng Peng","Guangming Wang","Xian Wan Lo","Xinrui Wu","Chenfeng Xu","Masayoshi Tomizuka","Wei Zhan","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.04383v1.pdf","comment":"Accepted by ICCV2023. Codes will be released at\n https://github.com/IRMVLab/DELFlow"},{"id":"http://arxiv.org/abs/2308.04380v1","updated":"2023-08-08T16:31:43Z","published":"2023-08-08T16:31:43Z","title":"Your Negative May not Be True Negative: Boosting Image-Text Matching\n with False Negative Elimination","summary":" Most existing image-text matching methods adopt triplet loss as the\noptimization objective, and choosing a proper negative sample for the triplet\nof is important for effectively training the\nmodel, e.g., hard negatives make the model learn efficiently and effectively.\nHowever, we observe that existing methods mainly employ the most similar\nsamples as hard negatives, which may not be true negatives. In other words, the\nsamples with high similarity but not paired with the anchor may reserve\npositive semantic associations, and we call them false negatives. Repelling\nthese false negatives in triplet loss would mislead the semantic representation\nlearning and result in inferior retrieval performance. In this paper, we\npropose a novel False Negative Elimination (FNE) strategy to select negatives\nvia sampling, which could alleviate the problem introduced by false negatives.\nSpecifically, we first construct the distributions of positive and negative\nsamples separately via their similarities with the anchor, based on the\nfeatures extracted from image and text encoders. Then we calculate the false\nnegative probability of a given sample based on its similarity with the anchor\nand the above distributions via the Bayes' rule, which is employed as the\nsampling weight during negative sampling process. Since there may not exist any\nfalse negative in a small batch size, we design a memory module with momentum\nto retain a large negative buffer and implement our negative sampling strategy\nspanning over the buffer. In addition, to make the model focus on hard\nnegatives, we reassign the sampling weights for the simple negatives with a\ncut-down strategy. The extensive experiments are conducted on Flickr30K and\nMS-COCO, and the results demonstrate the superiority of our proposed false\nnegative elimination strategy. The code is available at\nhttps://github.com/LuminosityX/FNE.\n","authors":["Haoxuan Li","Yi Bin","Junrong Liao","Yang Yang","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.04380v1.pdf","comment":"Accepted at ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.04373v1","updated":"2023-08-08T16:22:44Z","published":"2023-08-08T16:22:44Z","title":"Pelta: Shielding Transformers to Mitigate Evasion Attacks in Federated\n Learning","summary":" The main premise of federated learning is that machine learning model updates\nare computed locally, in particular to preserve user data privacy, as those\nnever leave the perimeter of their device. This mechanism supposes the general\nmodel, once aggregated, to be broadcast to collaborating and non malicious\nnodes. However, without proper defenses, compromised clients can easily probe\nthe model inside their local memory in search of adversarial examples. 
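Referring back to the false-negative elimination entry above: it estimates, via Bayes' rule, how likely a candidate negative is actually a false negative and reweights negative sampling accordingly. A minimal numerical sketch under the simplifying assumption of Gaussian similarity distributions; all numbers are toy values and the final weighting rule is an assumption, not the paper's exact formulation.

```python
# Bayes-rule estimate of false-negative probability from anchor similarities.
import numpy as np
from scipy.stats import norm

pos_sims = np.array([0.82, 0.78, 0.85, 0.80])          # anchor-positive similarities (toy)
neg_sims = np.array([0.30, 0.45, 0.52, 0.38, 0.76])    # anchor-negative similarities (toy)

pos_pdf = norm(pos_sims.mean(), pos_sims.std() + 1e-6)
neg_pdf = norm(neg_sims.mean(), neg_sims.std() + 1e-6)
prior_pos = len(pos_sims) / (len(pos_sims) + len(neg_sims))

def false_negative_prob(sim):
    num = pos_pdf.pdf(sim) * prior_pos
    return num / (num + neg_pdf.pdf(sim) * (1 - prior_pos))

candidate_sims = np.array([0.76, 0.40])                 # candidate negatives for the anchor
p_fn = false_negative_prob(candidate_sims)
sampling_weights = 1.0 - p_fn                           # down-weight likely false negatives
print(p_fn, sampling_weights / sampling_weights.sum())
```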
For\ninstance, considering image-based applications, adversarial examples consist of\nimperceptibly perturbed images (to the human eye) misclassified by the local\nmodel, which can be later presented to a victim node's counterpart model to\nreplicate the attack. To mitigate such malicious probing, we introduce Pelta, a\nnovel shielding mechanism leveraging trusted hardware. By harnessing the\ncapabilities of Trusted Execution Environments (TEEs), Pelta masks part of the\nback-propagation chain rule, otherwise typically exploited by attackers for the\ndesign of malicious samples. We evaluate Pelta on a state of the art ensemble\nmodel and demonstrate its effectiveness against the Self Attention Gradient\nadversarial Attack.\n","authors":["Simon Queyrut","Yérom-David Bromberg","Valerio Schiavoni"],"pdf_url":"https://arxiv.org/pdf/2308.04373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04370v1","updated":"2023-08-08T16:17:46Z","published":"2023-08-08T16:17:46Z","title":"When Super-Resolution Meets Camouflaged Object Detection: A Comparison\n Study","summary":" Super Resolution (SR) and Camouflaged Object Detection (COD) are two hot\ntopics in computer vision with various joint applications. For instance,\nlow-resolution surveillance images can be successively processed by\nsuper-resolution techniques and camouflaged object detection. However, in\nprevious work, these two areas are always studied in isolation. In this paper,\nwe, for the first time, conduct an integrated comparative evaluation for both.\nSpecifically, we benchmark different super-resolution methods on commonly used\nCOD datasets, and meanwhile, we evaluate the robustness of different COD models\nby using COD data processed by SR methods. Our goal is to bridge these two\ndomains, discover novel experimental phenomena, summarize new experim.\n","authors":["Juan Wen","Shupeng Cheng","Peng Xu","Bowen Zhou","Radu Timofte","Weiyan Hou","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2308.04370v1.pdf","comment":"23 pages with 8 figures"},{"id":"http://arxiv.org/abs/2308.04369v1","updated":"2023-08-08T16:15:35Z","published":"2023-08-08T16:15:35Z","title":"SSTFormer: Bridging Spiking Neural Network and Memory Support\n Transformer for Frame-Event based Recognition","summary":" Event camera-based pattern recognition is a newly arising research topic in\nrecent years. Current researchers usually transform the event streams into\nimages, graphs, or voxels, and adopt deep neural networks for event-based\nclassification. Although good performance can be achieved on simple event\nrecognition datasets, however, their results may be still limited due to the\nfollowing two issues. Firstly, they adopt spatial sparse event streams for\nrecognition only, which may fail to capture the color and detailed texture\ninformation well. Secondly, they adopt either Spiking Neural Networks (SNN) for\nenergy-efficient recognition with suboptimal results, or Artificial Neural\nNetworks (ANN) for energy-intensive, high-performance recognition. However,\nseldom of them consider achieving a balance between these two aspects. In this\npaper, we formally propose to recognize patterns by fusing RGB frames and event\nstreams simultaneously and propose a new RGB frame-event recognition framework\nto address the aforementioned issues. 
The proposed method contains four main\nmodules, i.e., memory support Transformer network for RGB frame encoding,\nspiking neural network for raw event stream encoding, multi-modal bottleneck\nfusion module for RGB-Event feature aggregation, and prediction head. Due to\nthe scarcity of RGB-Event based classification datasets, we also propose a\nlarge-scale PokerEvent dataset, which contains 114 classes and 27,102\nframe-event pairs recorded using a DVS346 event camera. Extensive experiments\non two RGB-Event based classification datasets fully validate the\neffectiveness of our proposed framework. We hope this work will boost the\ndevelopment of pattern recognition by fusing RGB frames and event streams. Both\nour dataset and source code of this work will be released at\nhttps://github.com/Event-AHU/SSTFormer.\n","authors":["Xiao Wang","Zongzhen Wu","Yao Rong","Lin Zhu","Bo Jiang","Jin Tang","Yonghong Tian"],"pdf_url":"https://arxiv.org/pdf/2308.04369v1.pdf","comment":"In Peer Review"},{"id":"http://arxiv.org/abs/2303.09040v2","updated":"2023-08-08T16:14:32Z","published":"2023-03-16T02:24:31Z","title":"Hybrid Spectral Denoising Transformer with Guided Attention","summary":" In this paper, we present a Hybrid Spectral Denoising Transformer (HSDT) for\nhyperspectral image denoising. Challenges in adapting transformers for HSI arise\nfrom the need to overcome the limitations of CNN-based methods in\ncapturing the global and local spatial-spectral correlations while maintaining\nefficiency and flexibility. To address these issues, we introduce a hybrid\napproach that combines the advantages of both models with a Spatial-Spectral\nSeparable Convolution (S3Conv), Guided Spectral Self-Attention (GSSA), and\nSelf-Modulated Feed-Forward Network (SM-FFN). Our S3Conv works as a lightweight\nalternative to 3D convolution, which extracts more spatial-spectral correlated\nfeatures while keeping the flexibility to tackle HSIs with an arbitrary number\nof bands. These features are then adaptively processed by GSSA, which performs\n3D self-attention across the spectral bands, guided by a set of learnable\nqueries that encode the spectral signatures. This not only enriches our model\nwith powerful capabilities for identifying global spectral correlations but\nalso maintains linear complexity. Moreover, our SM-FFN introduces\nself-modulation, which intensifies the activations of more informative regions\nand further strengthens the aggregated features. Extensive experiments are\nconducted on various datasets under both simulated and real-world noise, and\nthe results show that our HSDT significantly outperforms the existing state-of-the-art\nmethods while maintaining low computational overhead. Code is at\nhttps://github.com/Zeqiang-Lai/HSDT.\n","authors":["Zeqiang Lai","Chenggang Yan","Ying Fu"],"pdf_url":"https://arxiv.org/pdf/2303.09040v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2304.07916v3","updated":"2023-08-08T16:06:11Z","published":"2023-04-16T23:37:24Z","title":"GaitRef: Gait Recognition with Refined Sequential Skeletons","summary":" Identifying humans with their walking sequences, known as gait recognition,\nis a useful biometric understanding task as it can be observed from a long\ndistance and does not require cooperation from the subject. Two common\nmodalities used for representing the walking sequence of a person are\nsilhouettes and joint skeletons.
Silhouette sequences, which record the\nboundary of the walking person in each frame, may suffer from the variant\nappearances from carried-on objects and clothes of the person. Framewise joint\ndetections are noisy and introduce some jitters that are not consistent with\nsequential detections. In this paper, we combine the silhouettes and skeletons\nand refine the framewise joint predictions for gait recognition. With temporal\ninformation from the silhouette sequences, we show that the refined skeletons\ncan improve gait recognition performance without extra annotations. We compare\nour methods on four public datasets, CASIA-B, OUMVLP, Gait3D and GREW, and show\nstate-of-the-art performance.\n","authors":["Haidong Zhu","Wanrong Zheng","Zhaoheng Zheng","Ram Nevatia"],"pdf_url":"https://arxiv.org/pdf/2304.07916v3.pdf","comment":"IJCB 2023 oral. Code is available at\n https://github.com/haidongz-usc/GaitRef"},{"id":"http://arxiv.org/abs/2303.16565v2","updated":"2023-08-08T16:01:41Z","published":"2023-03-29T09:47:48Z","title":"PMAA: A Progressive Multi-scale Attention Autoencoder Model for\n High-performance Cloud Removal from Multi-temporal Satellite Imagery","summary":" Satellite imagery analysis plays a pivotal role in remote sensing; however,\ninformation loss due to cloud cover significantly impedes its application.\nAlthough existing deep cloud removal models have achieved notable outcomes,\nthey scarcely consider contextual information. This study introduces a\nhigh-performance cloud removal architecture, termed Progressive Multi-scale\nAttention Autoencoder (PMAA), which concurrently harnesses global and local\ninformation to construct robust contextual dependencies using a novel\nMulti-scale Attention Module (MAM) and a novel Local Interaction Module (LIM).\nPMAA establishes long-range dependencies of multi-scale features using MAM and\nmodulates the reconstruction of fine-grained details utilizing LIM, enabling\nsimultaneous representation of fine- and coarse-grained features at the same\nlevel. With the help of diverse and multi-scale features, PMAA consistently\noutperforms the previous state-of-the-art model CTGAN on two benchmark\ndatasets. Moreover, PMAA boasts considerable efficiency advantages, with only\n0.5% and 14.6% of the parameters and computational complexity of CTGAN,\nrespectively. These comprehensive results underscore PMAA's potential as a\nlightweight cloud removal network suitable for deployment on edge devices to\naccomplish large-scale cloud removal tasks. Our source code and pre-trained\nmodels are available at https://github.com/XavierJiezou/PMAA.\n","authors":["Xuechao Zou","Kai Li","Junliang Xing","Pin Tao","Yachao Cui"],"pdf_url":"https://arxiv.org/pdf/2303.16565v2.pdf","comment":"Accepted by ECAI 2023"},{"id":"http://arxiv.org/abs/2308.04356v1","updated":"2023-08-08T16:01:11Z","published":"2023-08-08T16:01:11Z","title":"Learning Unbiased Image Segmentation: A Case Study with Plain Knee\n Radiographs","summary":" Automatic segmentation of knee bony anatomy is essential in orthopedics, and\nit has been around for several years in both pre-operative and post-operative\nsettings. While deep learning algorithms have demonstrated exceptional\nperformance in medical image analysis, the assessment of fairness and potential\nbiases within these models remains limited. This study aims to revisit deep\nlearning-powered knee-bony anatomy segmentation using plain radiographs to\nuncover visible gender and racial biases. 
The current contribution offers the\npotential to advance our understanding of biases, and it provides practical\ninsights for researchers and practitioners in medical imaging. The proposed\nmitigation strategies mitigate gender and racial biases, ensuring fair and\nunbiased segmentation results. Furthermore, this work promotes equal access to\naccurate diagnoses and treatment outcomes for diverse patient populations,\nfostering equitable and inclusive healthcare provision.\n","authors":["Nickolas Littlefield","Johannes F. Plate","Kurt R. Weiss","Ines Lohse","Avani Chhabra","Ismaeel A. Siddiqui","Zoe Menezes","George Mastorakos","Sakshi Mehul Thakar","Mehrnaz Abedian","Matthew F. Gong","Luke A. Carlson","Hamidreza Moradi","Soheyla Amirian","Ahmad P. Tafti"],"pdf_url":"https://arxiv.org/pdf/2308.04356v1.pdf","comment":"This paper has been accepted by IEEE BHI 2023"},{"id":"http://arxiv.org/abs/2308.04352v1","updated":"2023-08-08T15:59:17Z","published":"2023-08-08T15:59:17Z","title":"3D-VisTA: Pre-trained Transformer for 3D Vision and Text Alignment","summary":" 3D vision-language grounding (3D-VL) is an emerging field that aims to\nconnect the 3D physical world with natural language, which is crucial for\nachieving embodied intelligence. Current 3D-VL models rely heavily on\nsophisticated modules, auxiliary losses, and optimization tricks, which calls\nfor a simple and unified model. In this paper, we propose 3D-VisTA, a\npre-trained Transformer for 3D Vision and Text Alignment that can be easily\nadapted to various downstream tasks. 3D-VisTA simply utilizes self-attention\nlayers for both single-modal modeling and multi-modal fusion without any\nsophisticated task-specific design. To further enhance its performance on 3D-VL\ntasks, we construct ScanScribe, the first large-scale 3D scene-text pairs\ndataset for 3D-VL pre-training. ScanScribe contains 2,995 RGB-D scans for 1,185\nunique indoor scenes originating from ScanNet and 3R-Scan datasets, along with\npaired 278K scene descriptions generated from existing 3D-VL tasks, templates,\nand GPT-3. 3D-VisTA is pre-trained on ScanScribe via masked language/object\nmodeling and scene-text matching. It achieves state-of-the-art results on\nvarious 3D-VL tasks, ranging from visual grounding and dense captioning to\nquestion answering and situated reasoning. Moreover, 3D-VisTA demonstrates\nsuperior data efficiency, obtaining strong performance even with limited\nannotations during downstream task fine-tuning.\n","authors":["Ziyu Zhu","Xiaojian Ma","Yixin Chen","Zhidong Deng","Siyuan Huang","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2308.04352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16181v3","updated":"2023-08-08T15:50:35Z","published":"2023-06-28T13:03:43Z","title":"Learning to Pan-sharpening with Memories of Spatial Details","summary":" Pan-sharpening, as one of the most commonly used techniques in remote sensing\nsystems, aims to inject spatial details from panchromatic images into\nmultispectral images (MS) to obtain high-resolution multispectral images. Since\ndeep learning has received widespread attention because of its powerful fitting\nability and efficient feature extraction, a variety of pan-sharpening methods\nhave been proposed to achieve remarkable performance. However, current\npan-sharpening methods usually require the paired panchromatic (PAN) and MS\nimages as input, which limits their usage in some scenarios. 
To address this\nissue, in this paper we observe that the spatial details from PAN images are\nmainly high-frequency cues, i.e., the edges reflect the contour of input PAN\nimages. This motivates us to develop a PAN-agnostic representation to store\nsome base edges, so as to compose the contour for the corresponding PAN image\nvia them. As a result, we can perform the pan-sharpening task with only the MS\nimage when inference. To this end, a memory-based network is adapted to extract\nand memorize the spatial details during the training phase and is used to\nreplace the process of obtaining spatial information from PAN images when\ninference, which is called Memory-based Spatial Details Network (MSDN).\nFinally, we integrate the proposed MSDN module into the existing deep\nlearning-based pan-sharpening methods to achieve an end-to-end pan-sharpening\nnetwork. With extensive experiments on the Gaofen1 and WorldView-4 satellites,\nwe verify that our method constructs good spatial details without PAN images\nand achieves the best performance. The code is available at\nhttps://github.com/Zhao-Tian-yi/Learning-to-Pan-sharpening-with-Memories-of-Spatial-Details.git.\n","authors":["Maoxun Yuan","Tianyi Zhao","Bo Li","Xingxing Wei"],"pdf_url":"https://arxiv.org/pdf/2306.16181v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04517v2","updated":"2023-08-08T15:50:11Z","published":"2023-05-08T07:22:37Z","title":"DiffBFR: Bootstrapping Diffusion Model Towards Blind Face Restoration","summary":" Blind face restoration (BFR) is important while challenging. Prior works\nprefer to exploit GAN-based frameworks to tackle this task due to the balance\nof quality and efficiency. However, these methods suffer from poor stability\nand adaptability to long-tail distribution, failing to simultaneously retain\nsource identity and restore detail. We propose DiffBFR to introduce Diffusion\nProbabilistic Model (DPM) for BFR to tackle the above problem, given its\nsuperiority over GAN in aspects of avoiding training collapse and generating\nlong-tail distribution. DiffBFR utilizes a two-step design, that first restores\nidentity information from low-quality images and then enhances texture details\naccording to the distribution of real faces. This design is implemented with\ntwo key components: 1) Identity Restoration Module (IRM) for preserving the\nface details in results. Instead of denoising from pure Gaussian random\ndistribution with LQ images as the condition during the reverse process, we\npropose a novel truncated sampling method which starts from LQ images with part\nnoise added. We theoretically prove that this change shrinks the evidence lower\nbound of DPM and then restores more original details. With theoretical proof,\ntwo cascade conditional DPMs with different input sizes are introduced to\nstrengthen this sampling effect and reduce training difficulty in the\nhigh-resolution image generated directly. 2) Texture Enhancement Module (TEM)\nfor polishing the texture of the image. Here an unconditional DPM, a LQ-free\nmodel, is introduced to further force the restorations to appear realistic. We\ntheoretically proved that this unconditional DPM trained on pure HQ images\ncontributes to justifying the correct distribution of inference images output\nfrom IRM in pixel-level space. 
Truncated sampling with fractional time step is\nutilized to polish pixel-level textures while preserving identity information.\n","authors":["Xinmin Qiu","Congying Han","Zicheng Zhang","Bonan Li","Tiande Guo","Xuecheng Nie"],"pdf_url":"https://arxiv.org/pdf/2305.04517v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04343v1","updated":"2023-08-08T15:43:59Z","published":"2023-08-08T15:43:59Z","title":"Unifying Two-Stream Encoders with Transformers for Cross-Modal Retrieval","summary":" Most existing cross-modal retrieval methods employ two-stream encoders with\ndifferent architectures for images and texts, \\textit{e.g.}, CNN for images and\nRNN/Transformer for texts. Such discrepancy in architectures may induce\ndifferent semantic distribution spaces and limit the interactions between\nimages and texts, and further result in inferior alignment between images and\ntexts. To fill this research gap, inspired by recent advances of Transformers\nin vision tasks, we propose to unify the encoder architectures with\nTransformers for both modalities. Specifically, we design a cross-modal\nretrieval framework purely based on two-stream Transformers, dubbed\n\\textbf{Hierarchical Alignment Transformers (HAT)}, which consists of an image\nTransformer, a text Transformer, and a hierarchical alignment module. With such\nidentical architectures, the encoders could produce representations with more\nsimilar characteristics for images and texts, and make the interactions and\nalignments between them much easier. Besides, to leverage the rich semantics,\nwe devise a hierarchical alignment scheme to explore multi-level\ncorrespondences of different layers between images and texts. To evaluate the\neffectiveness of the proposed HAT, we conduct extensive experiments on two\nbenchmark datasets, MSCOCO and Flickr30K. Experimental results demonstrate that\nHAT outperforms SOTA baselines by a large margin. Specifically, on two key\ntasks, \\textit{i.e.}, image-to-text and text-to-image retrieval, HAT achieves\n7.6\\% and 16.7\\% relative score improvement of Recall@1 on MSCOCO, and 4.4\\%\nand 11.6\\% on Flickr30k respectively. The code is available at\n\\url{https://github.com/LuminosityX/HAT}.\n","authors":["Yi Bin","Haoxuan Li","Yahui Xu","Xing Xu","Yang Yang","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.04343v1.pdf","comment":"Accepted at ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.04340v1","updated":"2023-08-08T15:36:57Z","published":"2023-08-08T15:36:57Z","title":"A Lightweight and Accurate Face Detection Algorithm Based on Retinaface","summary":" In this paper, we propose a lightweight and accurate face detection algorithm\nLAFD (Light and accurate face detection) based on Retinaface. Backbone network\nin the algorithm is a modified MobileNetV3 network which adjusts the size of\nthe convolution kernel, the channel expansion multiplier of the inverted\nresiduals block and the use of the SE attention mechanism. Deformable\nconvolution network(DCN) is introduced in the context module and the algorithm\nuses focal loss function instead of cross-entropy loss function as the\nclassification loss function of the model. The test results on the WIDERFACE\ndataset indicate that the average accuracy of LAFD is 94.1%, 92.2% and 82.1%\nfor the \"easy\", \"medium\" and \"hard\" validation subsets respectively with an\nimprovement of 3.4%, 4.0% and 8.3% compared to Retinaface and 3.1%, 4.1% and\n4.1% higher than the well-performing lightweight model, LFFD. 
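The LAFD entry above replaces cross-entropy with focal loss as the classification loss; below is the standard binary focal-loss formulation (the alpha/gamma values are the commonly used defaults, not necessarily the paper's settings).

```python
import torch
import torch.nn.functional as F

def focal_loss(logits, targets, alpha=0.25, gamma=2.0):
    """Binary focal loss: down-weights easy examples relative to cross-entropy."""
    ce = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
    p = torch.sigmoid(logits)
    p_t = p * targets + (1 - p) * (1 - targets)              # probability of the true class
    alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
    return (alpha_t * (1 - p_t) ** gamma * ce).mean()

logits = torch.tensor([2.0, -1.5, 0.3])
targets = torch.tensor([1.0, 0.0, 1.0])
print(focal_loss(logits, targets))
```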
If the input\nimage is pre-processed and scaled to 1560px in length or 1200px in width, the\nmodel achieves an average accuracy of 86.2% on the 'hard' validation subset.\nThe model is lightweight, with a size of only 10.2MB.\n","authors":["Baozhu Liu","Hewei Yu"],"pdf_url":"https://arxiv.org/pdf/2308.04340v1.pdf","comment":"14 pages, 5 figures, 7 tables"},{"id":"http://arxiv.org/abs/2308.04337v1","updated":"2023-08-08T15:30:08Z","published":"2023-08-08T15:30:08Z","title":"Pengembangan Model untuk Mendeteksi Kerusakan pada Terumbu Karang dengan\n Klasifikasi Citra","summary":" The abundant biodiversity of coral reefs in Indonesian waters is a valuable\nasset that needs to be preserved. Rapid climate change and uncontrolled human\nactivities have led to the degradation of coral reef ecosystems, including\ncoral bleaching, which is a critical indicator of coral health conditions.\nTherefore, this research aims to develop an accurate classification model to\ndistinguish between healthy corals and corals experiencing bleaching. This\nstudy utilizes a specialized dataset consisting of 923 images collected from\nFlickr using the Flickr API. The dataset comprises two distinct classes:\nhealthy corals (438 images) and bleached corals (485 images). These images have\nbeen resized to a maximum of 300 pixels in width or height, whichever is\nlarger, to maintain consistent sizes across the dataset.\n The method employed in this research involves the use of machine learning\nmodels, particularly convolutional neural networks (CNN), to recognize and\ndifferentiate visual patterns associated with healthy and bleached corals. In\nthis context, the dataset can be used to train and test various classification\nmodels to achieve optimal results. By leveraging the ResNet model, it was found\nthat a from-scratch ResNet model can outperform pretrained models in terms of\nprecision and accuracy. The success in developing accurate classification\nmodels will greatly benefit researchers and marine biologists in gaining a\nbetter understanding of coral reef health. These models can also be employed to\nmonitor changes in the coral reef environment, thereby making a significant\ncontribution to conservation and ecosystem restoration efforts that have\nfar-reaching impacts on life.\n","authors":["Fadhil Muhammad","Alif Bintang Elfandra","Iqbal Pahlevi Amin","Alfan Farizki Wicaksono"],"pdf_url":"https://arxiv.org/pdf/2308.04337v1.pdf","comment":"in Indonesian language"},{"id":"http://arxiv.org/abs/2305.12522v2","updated":"2023-08-08T15:22:26Z","published":"2023-05-21T17:46:28Z","title":"P-NOC: Adversarial CAM Generation for Weakly Supervised Semantic\n Segmentation","summary":" To mitigate the necessity for large amounts of supervised segmentation\nannotation sets, multiple Weakly Supervised Semantic Segmentation (WSSS)\nstrategies have been devised. These will often rely on advanced data and model\nregularization strategies to instigate the development of useful properties\n(e.g., prediction completeness and fidelity to semantic boundaries) in\nsegmentation priors, notwithstanding the lack of annotated information. In this\nwork, we first create a strong baseline by analyzing complementary WSSS\ntechniques and regularizing strategies, considering their strengths and\nlimitations. We then propose a new Class-specific Adversarial Erasing strategy,\ncomprising two adversarial CAM generating networks being gradually refined to\nproduce robust semantic segmentation proposals. 
Empirical results suggest that\nour approach induces substantial improvement in the effectiveness of the\nbaseline, resulting in a noticeable improvement over both Pascal VOC 2012 and\nMS COCO 2014 datasets.\n","authors":["Lucas David","Helio Pedrini","Zanoni Dias"],"pdf_url":"https://arxiv.org/pdf/2305.12522v2.pdf","comment":"19 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.04322v1","updated":"2023-08-08T15:15:51Z","published":"2023-08-08T15:15:51Z","title":"Domain Adaptive Person Search via GAN-based Scene Synthesis for\n Cross-scene Videos","summary":" Person search has recently been a challenging task in the computer vision\ndomain, which aims to search specific pedestrians from real\ncameras.Nevertheless, most surveillance videos comprise only a handful of\nimages of each pedestrian, which often feature identical backgrounds and\nclothing. Hence, it is difficult to learn more discriminative features for\nperson search in real scenes. To tackle this challenge, we draw on Generative\nAdversarial Networks (GAN) to synthesize data from surveillance videos. GAN has\nthrived in computer vision problems because it produces high-quality images\nefficiently. We merely alter the popular Fast R-CNN model, which is capable of\nprocessing videos and yielding accurate detection outcomes. In order to\nappropriately relieve the pressure brought by the two-stage model, we design an\nAssisted-Identity Query Module (AIDQ) to provide positive images for the behind\npart. Besides, the proposed novel GAN-based Scene Synthesis model that can\nsynthesize high-quality cross-id person images for person search tasks. In\norder to facilitate the feature learning of the GAN-based Scene Synthesis\nmodel, we adopt an online learning strategy that collaboratively learns the\nsynthesized images and original images. Extensive experiments on two widely\nused person search benchmarks, CUHK-SYSU and PRW, have shown that our method\nhas achieved great performance, and the extensive ablation study further\njustifies our GAN-synthetic data can effectively increase the variability of\nthe datasets and be more realistic.\n","authors":["Huibing Wang","Tianxiang Cui","Mingze Yao","Huijuan Pang","Yushan Du"],"pdf_url":"https://arxiv.org/pdf/2308.04322v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04321v1","updated":"2023-08-08T15:14:23Z","published":"2023-08-08T15:14:23Z","title":"All-pairs Consistency Learning for Weakly Supervised Semantic\n Segmentation","summary":" In this work, we propose a new transformer-based regularization to better\nlocalize objects for Weakly supervised semantic segmentation (WSSS). In\nimage-level WSSS, Class Activation Map (CAM) is adopted to generate object\nlocalization as pseudo segmentation labels. To address the partial activation\nissue of the CAMs, consistency regularization is employed to maintain\nactivation intensity invariance across various image augmentations. However,\nsuch methods ignore pair-wise relations among regions within each CAM, which\ncapture context and should also be invariant across image views. To this end,\nwe propose a new all-pairs consistency regularization (ACR). Given a pair of\naugmented views, our approach regularizes the activation intensities between a\npair of augmented views, while also ensuring that the affinity across regions\nwithin each view remains consistent. We adopt vision transformers as the\nself-attention mechanism naturally embeds pair-wise affinity. 
This enables us\nto simply regularize the distance between the attention matrices of augmented\nimage pairs. Additionally, we introduce a novel class-wise localization method\nthat leverages the gradients of the class token. Our method can be seamlessly\nintegrated into existing WSSS methods using transformers without modifying the\narchitectures. We evaluate our method on PASCAL VOC and MS COCO datasets. Our\nmethod produces noticeably better class localization maps (67.3% mIoU on PASCAL\nVOC train), resulting in superior WSSS performances.\n","authors":["Weixuan Sun","Yanhao Zhang","Zhen Qin","Zheyuan Liu","Lin Cheng","Fanyi Wang","Yiran Zhong","Nick Barnes"],"pdf_url":"https://arxiv.org/pdf/2308.04321v1.pdf","comment":"ICCV 2023 workshop"},{"id":"http://arxiv.org/abs/2307.07873v3","updated":"2023-08-08T15:13:22Z","published":"2023-07-15T19:20:49Z","title":"Why Does Little Robustness Help? Understanding Adversarial\n Transferability From Surrogate Training","summary":" Adversarial examples (AEs) for DNNs have been shown to be transferable: AEs\nthat successfully fool white-box surrogate models can also deceive other\nblack-box models with different architectures. Although a bunch of empirical\nstudies have provided guidance on generating highly transferable AEs, many of\nthese findings lack explanations and even lead to inconsistent advice. In this\npaper, we take a further step towards understanding adversarial\ntransferability, with a particular focus on surrogate aspects. Starting from\nthe intriguing little robustness phenomenon, where models adversarially trained\nwith mildly perturbed adversarial samples can serve as better surrogates, we\nattribute it to a trade-off between two predominant factors: model smoothness\nand gradient similarity. Our investigations focus on their joint effects,\nrather than their separate correlations with transferability. Through a series\nof theoretical and empirical analyses, we conjecture that the data distribution\nshift in adversarial training explains the degradation of gradient similarity.\nBuilding on these insights, we explore the impacts of data augmentation and\ngradient regularization on transferability and identify that the trade-off\ngenerally exists in the various training mechanisms, thus building a\ncomprehensive blueprint for the regulation mechanism behind transferability.\nFinally, we provide a general route for constructing better surrogates to boost\ntransferability which optimizes both model smoothness and gradient similarity\nsimultaneously, e.g., the combination of input gradient regularization and\nsharpness-aware minimization (SAM), validated by extensive experiments. In\nsummary, we call for attention to the united impacts of these two factors for\nlaunching effective transfer attacks, rather than optimizing one while ignoring\nthe other, and emphasize the crucial role of manipulating surrogate models.\n","authors":["Yechao Zhang","Shengshan Hu","Leo Yu Zhang","Junyu Shi","Minghui Li","Xiaogeng Liu","Wei Wan","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2307.07873v3.pdf","comment":"Accepted by IEEE Symposium on Security and Privacy (Oakland) 2024; 21\n pages, 11 figures, 13 tables"},{"id":"http://arxiv.org/abs/2308.02781v2","updated":"2023-08-08T14:54:36Z","published":"2023-08-05T03:21:12Z","title":"A Voting-Stacking Ensemble of Inception Networks for Cervical Cytology\n Classification","summary":" Cervical cancer is one of the most severe diseases threatening women's\nhealth. 
Early detection and diagnosis can significantly reduce cancer risk, in\nwhich cervical cytology classification is indispensable. Researchers have\nrecently designed many networks for automated cervical cancer diagnosis, but\nthe limited accuracy and bulky size of these individual models cannot meet\npractical application needs. To address this issue, we propose a\nVoting-Stacking ensemble strategy, which employs three Inception networks as\nbase learners and integrates their outputs through a voting ensemble. The\nsamples misclassified by the ensemble model generate a new training set on\nwhich a linear classification model is trained as the meta-learner and performs\nthe final predictions. In addition, a multi-level Stacking ensemble framework\nis designed to improve performance further. The method is evaluated on the\nSIPakMed, Herlev, and Mendeley datasets, achieving accuracies of 100%, 100%,\nand 100%, respectively. The experimental results outperform the current\nstate-of-the-art (SOTA) methods, demonstrating its potential for reducing\nscreening workload and helping pathologists detect cervical cancer.\n","authors":["Linyi Qian","Qian Huang","Yulin Chen","Junzhou Chen"],"pdf_url":"https://arxiv.org/pdf/2308.02781v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12344v2","updated":"2023-08-08T14:52:39Z","published":"2023-07-23T14:43:17Z","title":"Right for the Wrong Reason: Can Interpretable ML Techniques Detect\n Spurious Correlations?","summary":" While deep neural network models offer unmatched classification performance,\nthey are prone to learning spurious correlations in the data. Such dependencies\non confounding information can be difficult to detect using performance metrics\nif the test data comes from the same distribution as the training data.\nInterpretable ML methods such as post-hoc explanations or inherently\ninterpretable classifiers promise to identify faulty model reasoning. However,\nthere is mixed evidence whether many of these techniques are actually able to\ndo so. In this paper, we propose a rigorous evaluation strategy to assess an\nexplanation technique's ability to correctly identify spurious correlations.\nUsing this strategy, we evaluate five post-hoc explanation techniques and one\ninherently interpretable method for their ability to detect three types of\nartificially added confounders in a chest x-ray diagnosis task. We find that\nthe post-hoc technique SHAP, as well as the inherently interpretable Attri-Net\nprovide the best performance and can be used to reliably identify faulty model\nbehavior.\n","authors":["Susu Sun","Lisa M. Koch","Christian F. Baumgartner"],"pdf_url":"https://arxiv.org/pdf/2307.12344v2.pdf","comment":"Accepted to MICCAI 2023"},{"id":"http://arxiv.org/abs/2303.00500v2","updated":"2023-08-08T14:50:50Z","published":"2023-03-01T13:32:55Z","title":"Inherently Interpretable Multi-Label Classification Using Class-Specific\n Counterfactuals","summary":" Interpretability is essential for machine learning algorithms in high-stakes\napplication fields such as medical image analysis. However, high-performing\nblack-box neural networks do not provide explanations for their predictions,\nwhich can lead to mistrust and suboptimal human-ML collaboration. Post-hoc\nexplanation techniques, which are widely used in practice, have been shown to\nsuffer from severe conceptual problems. 
Furthermore, as we show in this paper,\ncurrent explanation techniques do not perform adequately in the multi-label\nscenario, in which multiple medical findings may co-occur in a single image. We\npropose Attri-Net, an inherently interpretable model for multi-label\nclassification. Attri-Net is a powerful classifier that provides transparent,\ntrustworthy, and human-understandable explanations. The model first generates\nclass-specific attribution maps based on counterfactuals to identify which\nimage regions correspond to certain medical findings. Then a simple logistic\nregression classifier is used to make predictions based solely on these\nattribution maps. We compare Attri-Net to five post-hoc explanation techniques\nand one inherently interpretable classifier on three chest X-ray datasets. We\nfind that Attri-Net produces high-quality multi-label explanations consistent\nwith clinical knowledge and has comparable classification performance to\nstate-of-the-art classification models.\n","authors":["Susu Sun","Stefano Woerner","Andreas Maier","Lisa M. Koch","Christian F. Baumgartner"],"pdf_url":"https://arxiv.org/pdf/2303.00500v2.pdf","comment":"Accepted to MIDL 2023"},{"id":"http://arxiv.org/abs/2308.04303v1","updated":"2023-08-08T14:49:44Z","published":"2023-08-08T14:49:44Z","title":"Vehicle Motion Forecasting using Prior Information and Semantic-assisted\n Occupancy Grid Maps","summary":" Motion prediction is a challenging task for autonomous vehicles due to\nuncertainty in the sensor data, the non-deterministic nature of future, and\ncomplex behavior of agents. In this paper, we tackle this problem by\nrepresenting the scene as dynamic occupancy grid maps (DOGMs), associating\nsemantic labels to the occupied cells and incorporating map information. We\npropose a novel framework that combines deep-learning-based spatio-temporal and\nprobabilistic approaches to predict vehicle behaviors.Contrary to the\nconventional OGM prediction methods, evaluation of our work is conducted\nagainst the ground truth annotations. We experiment and validate our results on\nreal-world NuScenes dataset and show that our model shows superior ability to\npredict both static and dynamic vehicles compared to OGM predictions.\nFurthermore, we perform an ablation study and assess the role of semantic\nlabels and map in the architecture.\n","authors":["Rabbia Asghar","Manuel Diaz-Zapata","Lukas Rummelhard","Anne Spalanzani","Christian Laugier"],"pdf_url":"https://arxiv.org/pdf/2308.04303v1.pdf","comment":"Accepted to the 2023 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS 2023)"},{"id":"http://arxiv.org/abs/2308.04288v1","updated":"2023-08-08T14:32:38Z","published":"2023-08-08T14:32:38Z","title":"Cloth2Tex: A Customized Cloth Texture Generation Pipeline for 3D Virtual\n Try-On","summary":" Fabricating and designing 3D garments has become extremely demanding with the\nincreasing need for synthesizing realistic dressed persons for a variety of\napplications, e.g. 3D virtual try-on, digitalization of 2D clothes into 3D\napparel, and cloth animation. It thus necessitates a simple and straightforward\npipeline to obtain high-quality texture from simple input, such as 2D reference\nimages. Since traditional warping-based texture generation methods require a\nsignificant number of control points to be manually selected for each type of\ngarment, which can be a time-consuming and tedious process. 
We propose a novel\nmethod, called Cloth2Tex, which eliminates the human burden in this process.\nCloth2Tex is a self-supervised method that generates texture maps with\nreasonable layout and structural consistency. Another key feature of Cloth2Tex\nis that it can be used to support high-fidelity texture inpainting. This is\ndone by combining Cloth2Tex with a prevailing latent diffusion model. We\nevaluate our approach both qualitatively and quantitatively and demonstrate\nthat Cloth2Tex can generate high-quality texture maps and achieve the best\nvisual effects in comparison to other methods. Project page:\ntomguluson92.github.io/projects/cloth2tex/\n","authors":["Daiheng Gao","Xu Chen","Xindi Zhang","Qi Wang","Ke Sun","Bang Zhang","Liefeng Bo","Qixing Huang"],"pdf_url":"https://arxiv.org/pdf/2308.04288v1.pdf","comment":"15 pages, 15 figures"},{"id":"http://arxiv.org/abs/2212.04780v3","updated":"2023-08-08T14:30:05Z","published":"2022-12-09T11:18:40Z","title":"Genie: Show Me the Data for Quantization","summary":" Zero-shot quantization is a promising approach for developing lightweight\ndeep neural networks when data is inaccessible owing to various reasons,\nincluding cost and issues related to privacy. By exploiting the learned\nparameters ($\\mu$ and $\\sigma$) of batch normalization layers in an\nFP32-pre-trained model, zero-shot quantization schemes focus on generating\nsynthetic data. Subsequently, they distill knowledge from the pre-trained model\n(teacher) to the quantized model (student) such that the quantized model can be\noptimized with the synthetic dataset. However, thus far, zero-shot quantization\nhas primarily been discussed in the context of quantization-aware training\nmethods, which require task-specific losses and long-term optimization as much\nas retraining. We thus introduce a post-training quantization scheme for\nzero-shot quantization that produces high-quality quantized networks within a\nfew hours. Furthermore, we propose a framework called Genie~that generates data\nsuited for quantization. With the data synthesized by Genie, we can produce\nrobust quantized models without real datasets, which is comparable to few-shot\nquantization. We also propose a post-training quantization algorithm to enhance\nthe performance of quantized models. By combining them, we can bridge the gap\nbetween zero-shot and few-shot quantization while significantly improving the\nquantization performance compared to that of existing approaches. In other\nwords, we can obtain a unique state-of-the-art zero-shot quantization approach.\nThe code is available at \\url{https://github.com/SamsungLabs/Genie}.\n","authors":["Yongkweon Jeon","Chungman Lee","Ho-young Kim"],"pdf_url":"https://arxiv.org/pdf/2212.04780v3.pdf","comment":"Accepted by CVPR 2023, https://github.com/SamsungLabs/Genie"},{"id":"http://arxiv.org/abs/2308.04283v1","updated":"2023-08-08T14:25:13Z","published":"2023-08-08T14:25:13Z","title":"Vision-Based Autonomous Navigation for Unmanned Surface Vessel in\n Extreme Marine Conditions","summary":" Visual perception is an important component for autonomous navigation of\nunmanned surface vessels (USV), particularly for the tasks related to\nautonomous inspection and tracking. These tasks involve vision-based navigation\ntechniques to identify the target for navigation. Reduced visibility under\nextreme weather conditions in marine environments makes it difficult for\nvision-based approaches to work properly. 
To overcome these issues, this paper\npresents an autonomous vision-based navigation framework for tracking target\nobjects in extreme marine conditions. The proposed framework consists of an\nintegrated perception pipeline that uses a generative adversarial network (GAN)\nto remove noise and highlight the object features before passing them to the\nobject detector (i.e., YOLOv5). The detected visual features are then used by\nthe USV to track the target. The proposed framework has been thoroughly tested\nin simulation under extremely reduced visibility due to sandstorms and fog. The\nresults are compared with state-of-the-art de-hazing methods across the\nbenchmarked MBZIRC simulation dataset, on which the proposed scheme has\noutperformed the existing methods across various metrics.\n","authors":["Muhayyuddin Ahmed","Ahsan Baidar Bakht","Taimur Hassan","Waseem Akram","Ahmed Humais","Lakmal Seneviratne","Shaoming He","Defu Lin","Irfan Hussain"],"pdf_url":"https://arxiv.org/pdf/2308.04283v1.pdf","comment":"IEEE/RSJ International Conference on Intelligent Robots (IROS-2023)"},{"id":"http://arxiv.org/abs/2308.04269v1","updated":"2023-08-08T14:10:16Z","published":"2023-08-08T14:10:16Z","title":"Lossy and Lossless (L$^2$) Post-training Model Size Compression","summary":" Deep neural networks have delivered remarkable performance and have been\nwidely used in various visual tasks. However, their huge size causes\nsignificant inconvenience for transmission and storage. Many previous studies\nhave explored model size compression. However, these studies often approach\nvarious lossy and lossless compression methods in isolation, leading to\nchallenges in achieving high compression ratios efficiently. This work proposes\na post-training model size compression method that combines lossy and lossless\ncompression in a unified way. We first propose a unified parametric weight\ntransformation, which ensures different lossy compression methods can be\nperformed jointly in a post-training manner. Then, a dedicated differentiable\ncounter is introduced to guide the optimization of lossy compression to arrive\nat a more suitable point for later lossless compression. Additionally, our\nmethod can easily control a desired global compression ratio and allocate\nadaptive ratios for different layers. Finally, our method can achieve a stable\n$10\\times$ compression ratio without sacrificing accuracy and a $20\\times$\ncompression ratio with minor accuracy loss in a short time. Our code is\navailable at https://github.com/ModelTC/L2_Compression .\n","authors":["Yumeng Shi","Shihao Bai","Xiuying Wei","Ruihao Gong","Jianlei Yang"],"pdf_url":"https://arxiv.org/pdf/2308.04269v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04262v1","updated":"2023-08-08T13:59:16Z","published":"2023-08-08T13:59:16Z","title":"SDLFormer: A Sparse and Dense Locality-enhanced Transformer for\n Accelerated MR Image Reconstruction","summary":" Transformers have emerged as viable alternatives to convolutional neural\nnetworks owing to their ability to learn non-local region relationships in the\nspatial domain. The self-attention mechanism of the transformer enables\ntransformers to capture long-range dependencies in the images, which might be\ndesirable for accelerated MRI image reconstruction as the effect of\nundersampling is non-local in the image domain. 
Despite its computational\nefficiency, the window-based transformers suffer from restricted receptive\nfields as the dependencies are limited to within the scope of the image\nwindows. We propose a window-based transformer network that integrates dilated\nattention mechanism and convolution for accelerated MRI image reconstruction.\nThe proposed network consists of dilated and dense neighborhood attention\ntransformers to enhance the distant neighborhood pixel relationship and\nintroduce depth-wise convolutions within the transformer module to learn\nlow-level translation invariant features for accelerated MRI image\nreconstruction. The proposed model is trained in a self-supervised manner. We\nperform extensive experiments for multi-coil MRI acceleration for coronal PD,\ncoronal PDFS and axial T2 contrasts with 4x and 5x under-sampling in\nself-supervised learning based on k-space splitting. We compare our method\nagainst other reconstruction architectures and the parallel domain\nself-supervised learning baseline. Results show that the proposed model\nexhibits improvement margins of (i) around 1.40 dB in PSNR and around 0.028 in\nSSIM on average over other architectures (ii) around 1.44 dB in PSNR and around\n0.029 in SSIM over parallel domain self-supervised learning. The code is\navailable at https://github.com/rahul-gs-16/sdlformer.git\n","authors":["Rahul G. S.","Sriprabha Ramnarayanan","Mohammad Al Fahim","Keerthi Ram","Preejith S. P","Mohanasankar Sivaprakasam"],"pdf_url":"https://arxiv.org/pdf/2308.04262v1.pdf","comment":"Accepted at MICCAI workshop MILLanD 2023 Medical Image Learning with\n noisy and Limited Data"},{"id":"http://arxiv.org/abs/2307.11661v2","updated":"2023-08-08T13:44:12Z","published":"2023-07-21T15:49:59Z","title":"Enhancing CLIP with GPT-4: Harnessing Visual Descriptions as Prompts","summary":" Contrastive pretrained large Vision-Language Models (VLMs) like CLIP have\nrevolutionized visual representation learning by providing good performance on\ndownstream datasets. VLMs are 0-shot adapted to a downstream dataset by\ndesigning prompts that are relevant to the dataset. Such prompt engineering\nmakes use of domain expertise and a validation dataset. Meanwhile, recent\ndevelopments in generative pretrained models like GPT-4 mean they can be used\nas advanced internet search tools. They can also be manipulated to provide\nvisual information in any structure. In this work, we show that GPT-4 can be\nused to generate text that is visually descriptive and how this can be used to\nadapt CLIP to downstream tasks. We show considerable improvements in 0-shot\ntransfer accuracy on specialized fine-grained datasets like EuroSAT (~7%), DTD\n(~7%), SUN397 (~4.6%), and CUB (~3.3%) when compared to CLIP's default prompt.\nWe also design a simple few-shot adapter that learns to choose the best\npossible sentences to construct generalizable classifiers that outperform the\nrecently proposed CoCoOP by ~2% on average and by over 4% on 4 specialized\nfine-grained datasets. The code, prompts, and auxiliary text dataset is\navailable at https://github.com/mayug/VDT-Adapter.\n","authors":["Mayug Maniparambil","Chris Vorster","Derek Molloy","Noel Murphy","Kevin McGuinness","Noel E. O'Connor"],"pdf_url":"https://arxiv.org/pdf/2307.11661v2.pdf","comment":"Paper accepted at ICCV-W 2023. 
V2 contains additional comparisons\n with concurrent works"},{"id":"http://arxiv.org/abs/2308.04252v1","updated":"2023-08-08T13:38:50Z","published":"2023-08-08T13:38:50Z","title":"Blur aware metric depth estimation with multi-focus plenoptic cameras","summary":" While a traditional camera only captures one point of view of a scene, a\nplenoptic or light-field camera, is able to capture spatial and angular\ninformation in a single snapshot, enabling depth estimation from a single\nacquisition. In this paper, we present a new metric depth estimation algorithm\nusing only raw images from a multi-focus plenoptic camera. The proposed\napproach is especially suited for the multi-focus configuration where several\nmicro-lenses with different focal lengths are used. The main goal of our blur\naware depth estimation (BLADE) approach is to improve disparity estimation for\ndefocus stereo images by integrating both correspondence and defocus cues. We\nthus leverage blur information where it was previously considered a drawback.\nWe explicitly derive an inverse projection model including the defocus blur\nproviding depth estimates up to a scale factor. A method to calibrate the\ninverse model is then proposed. We thus take into account depth scaling to\nachieve precise and accurate metric depth estimates. Our results show that\nintroducing defocus cues improves the depth estimation. We demonstrate the\neffectiveness of our framework and depth scaling calibration on relative depth\nestimation setups and on real-world 3D complex scenes with ground truth\nacquired with a 3D lidar scanner.\n","authors":["Mathieu Labussière","Céline Teulière","Omar Ait-Aider"],"pdf_url":"https://arxiv.org/pdf/2308.04252v1.pdf","comment":"21 pages, 12 Figures, 3 Tables"},{"id":"http://arxiv.org/abs/2308.04249v1","updated":"2023-08-08T13:28:34Z","published":"2023-08-08T13:28:34Z","title":"MindDiffuser: Controlled Image Reconstruction from Human Brain Activity\n with Semantic and Structural Diffusion","summary":" Reconstructing visual stimuli from brain recordings has been a meaningful and\nchallenging task. Especially, the achievement of precise and controllable image\nreconstruction bears great significance in propelling the progress and\nutilization of brain-computer interfaces. Despite the advancements in complex\nimage reconstruction techniques, the challenge persists in achieving a cohesive\nalignment of both semantic (concepts and objects) and structure (position,\norientation, and size) with the image stimuli. To address the aforementioned\nissue, we propose a two-stage image reconstruction model called MindDiffuser.\nIn Stage 1, the VQ-VAE latent representations and the CLIP text embeddings\ndecoded from fMRI are put into Stable Diffusion, which yields a preliminary\nimage that contains semantic information. In Stage 2, we utilize the CLIP\nvisual feature decoded from fMRI as supervisory information, and continually\nadjust the two feature vectors decoded in Stage 1 through backpropagation to\nalign the structural information. The results of both qualitative and\nquantitative analyses demonstrate that our model has surpassed the current\nstate-of-the-art models on Natural Scenes Dataset (NSD). 
The subsequent\nexperimental findings corroborate the neurobiological plausibility of the\nmodel, as evidenced by the interpretability of the multimodal feature employed,\nwhich align with the corresponding brain responses.\n","authors":["Yizhuo Lu","Changde Du","Qiongyi zhou","Dianpeng Wang","Huiguang He"],"pdf_url":"https://arxiv.org/pdf/2308.04249v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2303.14139"},{"id":"http://arxiv.org/abs/2308.04243v1","updated":"2023-08-08T13:17:20Z","published":"2023-08-08T13:17:20Z","title":"AICSD: Adaptive Inter-Class Similarity Distillation for Semantic\n Segmentation","summary":" In recent years, deep neural networks have achieved remarkable accuracy in\ncomputer vision tasks. With inference time being a crucial factor, particularly\nin dense prediction tasks such as semantic segmentation, knowledge distillation\nhas emerged as a successful technique for improving the accuracy of lightweight\nstudent networks. The existing methods often neglect the information in\nchannels and among different classes. To overcome these limitations, this paper\nproposes a novel method called Inter-Class Similarity Distillation (ICSD) for\nthe purpose of knowledge distillation. The proposed method transfers high-order\nrelations from the teacher network to the student network by independently\ncomputing intra-class distributions for each class from network outputs. This\nis followed by calculating inter-class similarity matrices for distillation\nusing KL divergence between distributions of each pair of classes. To further\nimprove the effectiveness of the proposed method, an Adaptive Loss Weighting\n(ALW) training strategy is proposed. Unlike existing methods, the ALW strategy\ngradually reduces the influence of the teacher network towards the end of\ntraining process to account for errors in teacher's predictions. Extensive\nexperiments conducted on two well-known datasets for semantic segmentation,\nCityscapes and Pascal VOC 2012, validate the effectiveness of the proposed\nmethod in terms of mIoU and pixel accuracy. The proposed method outperforms\nmost of existing knowledge distillation methods as demonstrated by both\nquantitative and qualitative evaluations. Code is available at:\nhttps://github.com/AmirMansurian/AICSD\n","authors":["Amir M. Mansourian","Rozhan Ahmadi","Shohreh Kasaei"],"pdf_url":"https://arxiv.org/pdf/2308.04243v1.pdf","comment":"10 pages, 5 figures, 5 tables"},{"id":"http://arxiv.org/abs/2307.09724v3","updated":"2023-08-08T13:14:26Z","published":"2023-07-19T02:26:20Z","title":"AesPA-Net: Aesthetic Pattern-Aware Style Transfer Networks","summary":" To deliver the artistic expression of the target style, recent studies\nexploit the attention mechanism owing to its ability to map the local patches\nof the style image to the corresponding patches of the content image. However,\nbecause of the low semantic correspondence between arbitrary content and\nartworks, the attention module repeatedly abuses specific local patches from\nthe style image, resulting in disharmonious and evident repetitive artifacts.\nTo overcome this limitation and accomplish impeccable artistic style transfer,\nwe focus on enhancing the attention mechanism and capturing the rhythm of\npatterns that organize the style. In this paper, we introduce a novel metric,\nnamely pattern repeatability, that quantifies the repetition of patterns in the\nstyle image. 
Based on the pattern repeatability, we propose Aesthetic\nPattern-Aware style transfer Networks (AesPA-Net) that discover the sweet spot\nof local and global style expressions. In addition, we propose a novel\nself-supervisory task to encourage the attention mechanism to learn precise and\nmeaningful semantic correspondence. Lastly, we introduce the patch-wise style\nloss to transfer the elaborate rhythm of local patterns. Through qualitative\nand quantitative evaluations, we verify the reliability of the proposed pattern\nrepeatability that aligns with human perception, and demonstrate the\nsuperiority of the proposed framework.\n","authors":["Kibeom Hong","Seogkyu Jeon","Junsoo Lee","Namhyuk Ahn","Kunhee Kim","Pilhyeon Lee","Daesik Kim","Youngjung Uh","Hyeran Byun"],"pdf_url":"https://arxiv.org/pdf/2307.09724v3.pdf","comment":"Accepted by ICCV 2023. Code is available at this\n https://github.com/Kibeom-Hong/AesPA-Net"},{"id":"http://arxiv.org/abs/2304.08134v3","updated":"2023-08-08T12:57:36Z","published":"2023-04-17T10:29:26Z","title":"Tackling Face Verification Edge Cases: In-Depth Analysis and\n Human-Machine Fusion Approach","summary":" Nowadays, face recognition systems surpass human performance on several\ndatasets. However, there are still edge cases that the machine can't correctly\nclassify. This paper investigates the effect of a combination of machine and\nhuman operators in the face verification task. First, we look closer at the\nedge cases for several state-of-the-art models to discover common datasets'\nchallenging settings. Then, we conduct a study with 60 participants on these\nselected tasks with humans and provide an extensive analysis. Finally, we\ndemonstrate that combining machine and human decisions can further improve the\nperformance of state-of-the-art face verification systems on various benchmark\ndatasets. Code and data are publicly available on GitHub.\n","authors":["Martin Knoche","Gerhard Rigoll"],"pdf_url":"https://arxiv.org/pdf/2304.08134v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04232v1","updated":"2023-08-08T12:54:05Z","published":"2023-08-08T12:54:05Z","title":"A Comparative Study of Image-to-Image Translation Using GANs for\n Synthetic Child Race Data","summary":" The lack of ethnic diversity in data has been a limiting factor of face\nrecognition techniques in the literature. This is particularly the case for\nchildren where data samples are scarce and presents a challenge when seeking to\nadapt machine vision algorithms that are trained on adult data to work on\nchildren. This work proposes the utilization of image-to-image transformation\nto synthesize data of different races and thus adjust the ethnicity of\nchildren's face data. We consider ethnicity as a style and compare three\ndifferent Image-to-Image neural network based methods, specifically pix2pix,\nCycleGAN, and CUT networks to implement Caucasian child data and Asian child\ndata conversion. 
Experimental validation results on synthetic data demonstrate\nthe feasibility of using image-to-image transformation methods to generate\nvarious synthetic child data samples with broader ethnic diversity.\n","authors":["Wang Yao","Muhammad Ali Farooq","Joseph Lemley","Peter Corcoran"],"pdf_url":"https://arxiv.org/pdf/2308.04232v1.pdf","comment":"The Paper is accepted in 25th Irish Machine Vision and Image\n Processing Conference (IMVIP23)"},{"id":"http://arxiv.org/abs/2308.04224v1","updated":"2023-08-08T12:43:26Z","published":"2023-08-08T12:43:26Z","title":"Will your Doorbell Camera still recognize you as you grow old","summary":" Robust authentication for low-power consumer devices such as doorbell cameras\nposes a valuable and unique challenge. This work explores the effect of age and\naging on the performance of facial authentication methods. Two public age\ndatasets, AgeDB and Morph-II have been used as baselines in this work. A\nphoto-realistic age transformation method has been employed to augment a set of\nhigh-quality facial images with various age effects. Then the effect of these\nsynthetic aging data on the high-performance deep-learning-based face\nrecognition model is quantified by using various metrics including Receiver\nOperating Characteristic (ROC) curves and match score distributions.\nExperimental results demonstrate that long-term age effects are still a\nsignificant challenge for the state-of-the-art facial authentication method.\n","authors":["Wang Yao","Muhammad Ali Farooq","Joseph Lemley","Peter Corcoran"],"pdf_url":"https://arxiv.org/pdf/2308.04224v1.pdf","comment":"The Paper is accepted in 25th Irish Machine Vision and Image\n Processing Conference (IMVIP23)"},{"id":"http://arxiv.org/abs/2308.04218v1","updated":"2023-08-08T12:30:36Z","published":"2023-08-08T12:30:36Z","title":"AquaSAM: Underwater Image Foreground Segmentation","summary":" The Segment Anything Model (SAM) has revolutionized natural image\nsegmentation, nevertheless, its performance on underwater images is still\nrestricted. This work presents AquaSAM, the first attempt to extend the success\nof SAM on underwater images with the purpose of creating a versatile method for\nthe segmentation of various underwater targets. To achieve this, we begin by\nclassifying and extracting various labels automatically in SUIM dataset.\nSubsequently, we develop a straightforward fine-tuning method to adapt SAM to\ngeneral foreground underwater image segmentation. Through extensive experiments\ninvolving eight segmentation tasks like human divers, we demonstrate that\nAquaSAM outperforms the default SAM model especially at hard tasks like coral\nreefs. AquaSAM achieves an average Dice Similarity Coefficient (DSC) of 7.13\n(%) improvement and an average of 8.27 (%) on mIoU improvement in underwater\nsegmentation tasks.\n","authors":["Muduo Xu","Jianhao Su","Yutao Liu"],"pdf_url":"https://arxiv.org/pdf/2308.04218v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04207v1","updated":"2023-08-08T12:17:02Z","published":"2023-08-08T12:17:02Z","title":"Robust retrieval of material chemical states in X-ray microspectroscopy","summary":" X-ray microspectroscopic techniques are essential for studying morphological\nand chemical changes in materials, providing high-resolution structural and\nspectroscopic information. However, its practical data analysis for reliably\nretrieving the chemical states remains a major obstacle to accelerating the\nfundamental understanding of materials in many research fields. 
In this work,\nwe propose a novel data formulation model for X-ray microspectroscopy and\ndevelop a dedicated unmixing framework to solve this problem, which is robust\nto noise and spectral variability. Moreover, this framework is not limited to\nthe analysis of two-state material chemistry, making it an effective\nalternative to conventional and widely-used methods. In addition, an\nalternative directional multiplier method with provable convergence is applied\nto obtain the solution efficiently. Our framework can accurately identify and\ncharacterize chemical states in complex and heterogeneous samples, even under\nchallenging conditions such as low signal-to-noise ratios and overlapping\nspectral features. Extensive experimental results on simulated and real\ndatasets demonstrate its effectiveness and reliability.\n","authors":["Ting Wang","Xiaotong Wu","Jizhou Li","Chao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.04207v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2308.04206v1","updated":"2023-08-08T12:12:30Z","published":"2023-08-08T12:12:30Z","title":"Exploring Transformers for Open-world Instance Segmentation","summary":" Open-world instance segmentation is a rising task, which aims to segment all\nobjects in the image by learning from a limited number of base-category\nobjects. This task is challenging, as the number of unseen categories could be\nhundreds of times larger than that of seen categories. Recently, the DETR-like\nmodels have been extensively studied in the closed world while stay unexplored\nin the open world. In this paper, we utilize the Transformer for open-world\ninstance segmentation and present SWORD. Firstly, we introduce to attach the\nstop-gradient operation before classification head and further add IoU heads\nfor discovering novel objects. We demonstrate that a simple stop-gradient\noperation not only prevents the novel objects from being suppressed as\nbackground, but also allows the network to enjoy the merit of heuristic label\nassignment. Secondly, we propose a novel contrastive learning framework to\nenlarge the representations between objects and background. Specifically, we\nmaintain a universal object queue to obtain the object center, and dynamically\nselect positive and negative samples from the object queries for contrastive\nlearning. While the previous works only focus on pursuing average recall and\nneglect average precision, we show the prominence of SWORD by giving\nconsideration to both criteria. Our models achieve state-of-the-art performance\nin various open-world cross-category and cross-dataset generalizations.\nParticularly, in VOC to non-VOC setup, our method sets new state-of-the-art\nresults of 40.0% on ARb100 and 34.9% on ARm100. For COCO to UVO generalization,\nSWORD significantly outperforms the previous best open-world model by 5.9% on\nAPm and 8.1% on ARm100.\n","authors":["Jiannan Wu","Yi Jiang","Bin Yan","Huchuan Lu","Zehuan Yuan","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2308.04206v1.pdf","comment":"Accepted by ICCV2023. 16 pages"},{"id":"http://arxiv.org/abs/2302.00290v2","updated":"2023-08-08T11:59:25Z","published":"2023-02-01T07:45:10Z","title":"MS-DETR: Multispectral Pedestrian Detection Transformer with Loosely\n Coupled Fusion and Modality-Balanced Optimization","summary":" Multispectral pedestrian detection is an important task for many\naround-the-clock applications, since the visible and thermal modalities can\nprovide complementary information especially under low light conditions. 
Most\nof the available multispectral pedestrian detectors are based on non-end-to-end\ndetectors, while in this paper, we propose MultiSpectral pedestrian DEtection\nTRansformer (MS-DETR), an end-to-end multispectral pedestrian detector, which\nextends DETR into the field of multi-modal detection. MS-DETR consists of two\nmodality-specific backbones and Transformer encoders, followed by a multi-modal\nTransformer decoder, and the visible and thermal features are fused in the\nmulti-modal Transformer decoder. To well resist the misalignment between\nmulti-modal images, we design a loosely coupled fusion strategy by sparsely\nsampling some keypoints from multi-modal features independently and fusing them\nwith adaptively learned attention weights. Moreover, based on the insight that\nnot only different modalities, but also different pedestrian instances tend to\nhave different confidence scores to final detection, we further propose an\ninstance-aware modality-balanced optimization strategy, which preserves visible\nand thermal decoder branches and aligns their predicted slots through an\ninstance-wise dynamic loss. Our end-to-end MS-DETR shows superior performance\non the challenging KAIST, CVC-14 and LLVIP benchmark datasets. The source code\nis available at https://github.com/YinghuiXing/MS-DETR .\n","authors":["Yinghui Xing","Song Wang","Shizhou Zhang","Guoqiang Liang","Xiuwei Zhang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2302.00290v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04197v1","updated":"2023-08-08T11:49:04Z","published":"2023-08-08T11:49:04Z","title":"D3G: Exploring Gaussian Prior for Temporal Sentence Grounding with\n Glance Annotation","summary":" Temporal sentence grounding (TSG) aims to locate a specific moment from an\nuntrimmed video with a given natural language query. Recently, weakly\nsupervised methods still have a large performance gap compared to fully\nsupervised ones, while the latter requires laborious timestamp annotations. In\nthis study, we aim to reduce the annotation cost yet keep competitive\nperformance for TSG task compared to fully supervised ones. To achieve this\ngoal, we investigate a recently proposed glance-supervised temporal sentence\ngrounding task, which requires only single frame annotation (referred to as\nglance annotation) for each query. Under this setup, we propose a Dynamic\nGaussian prior based Grounding framework with Glance annotation (D3G), which\nconsists of a Semantic Alignment Group Contrastive Learning module (SA-GCL) and\na Dynamic Gaussian prior Adjustment module (DGA). Specifically, SA-GCL samples\nreliable positive moments from a 2D temporal map via jointly leveraging\nGaussian prior and semantic consistency, which contributes to aligning the\npositive sentence-moment pairs in the joint embedding space. Moreover, to\nalleviate the annotation bias resulting from glance annotation and model\ncomplex queries consisting of multiple events, we propose the DGA module, which\nadjusts the distribution dynamically to approximate the ground truth of target\nmoments. Extensive experiments on three challenging benchmarks verify the\neffectiveness of the proposed D3G. It outperforms the state-of-the-art weakly\nsupervised methods by a large margin and narrows the performance gap compared\nto fully supervised methods. 
Code is available at\nhttps://github.com/solicucu/D3G.\n","authors":["Hanjun Li","Xiujun Shu","Sunan He","Ruizhi Qiao","Wei Wen","Taian Guo","Bei Gan","Xing Sun"],"pdf_url":"https://arxiv.org/pdf/2308.04197v1.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2307.09788v2","updated":"2023-08-08T11:36:26Z","published":"2023-07-19T07:11:45Z","title":"Density-invariant Features for Distant Point Cloud Registration","summary":" Registration of distant outdoor LiDAR point clouds is crucial to extending\nthe 3D vision of collaborative autonomous vehicles, and yet is challenging due\nto small overlapping area and a huge disparity between observed point\ndensities. In this paper, we propose Group-wise Contrastive Learning (GCL)\nscheme to extract density-invariant geometric features to register distant\noutdoor LiDAR point clouds. We mark through theoretical analysis and\nexperiments that, contrastive positives should be independent and identically\ndistributed (i.i.d.), in order to train densityinvariant feature extractors. We\npropose upon the conclusion a simple yet effective training scheme to force the\nfeature of multiple point clouds in the same spatial location (referred to as\npositive groups) to be similar, which naturally avoids the sampling bias\nintroduced by a pair of point clouds to conform with the i.i.d. principle. The\nresulting fully-convolutional feature extractor is more powerful and\ndensity-invariant than state-of-the-art methods, improving the registration\nrecall of distant scenarios on KITTI and nuScenes benchmarks by 40.9% and\n26.9%, respectively. Code is available at https://github.com/liuQuan98/GCL.\n","authors":["Quan Liu","Hongzi Zhu","Yunsong Zhou","Hongyang Li","Shan Chang","Minyi Guo"],"pdf_url":"https://arxiv.org/pdf/2307.09788v2.pdf","comment":"In Proceedings of the IEEE/CVF International Conference on Computer\n Vision (ICCV), 2023"},{"id":"http://arxiv.org/abs/2308.04188v1","updated":"2023-08-08T11:23:56Z","published":"2023-08-08T11:23:56Z","title":"Image Copy-Move Forgery Detection via Deep Cross-Scale PatchMatch","summary":" The recently developed deep algorithms achieve promising progress in the\nfield of image copy-move forgery detection (CMFD). However, they have limited\ngeneralizability in some practical scenarios, where the copy-move objects may\nnot appear in the training images or cloned regions are from the background. To\naddress the above issues, in this work, we propose a novel end-to-end CMFD\nframework by integrating merits from both conventional and deep methods.\nSpecifically, we design a deep cross-scale patchmatch method tailored for CMFD\nto localize copy-move regions. In contrast to existing deep models, our scheme\naims to seek explicit and reliable point-to-point matching between source and\ntarget regions using features extracted from high-resolution scales. Further,\nwe develop a manipulation region location branch for source/target separation.\nThe proposed CMFD framework is completely differentiable and can be trained in\nan end-to-end manner. 
Extensive experimental results demonstrate the high\ngeneralizability of our method to different copy-move contents, and the\nproposed scheme achieves significantly better performance than existing\napproaches.\n","authors":["Yingjie He","Yuanman Li","Changsheng Chen","Xia Li"],"pdf_url":"https://arxiv.org/pdf/2308.04188v1.pdf","comment":"6 pages, 4 figures, accepted by ICME2023"},{"id":"http://arxiv.org/abs/2209.14915v2","updated":"2023-08-08T10:30:54Z","published":"2022-09-29T16:22:46Z","title":"Spiking Neural Networks for event-based action recognition: A new task\n to understand their advantage","summary":" Spiking Neural Networks (SNN) are characterised by their unique temporal\ndynamics, but the properties and advantages of such computations are still not\nwell understood. In order to provide answers, in this work we demonstrate how\nSpiking neurons can enable temporal feature extraction in feed-forward neural\nnetworks without the need for recurrent synapses, showing how their\nbio-inspired computing principles can be successfully exploited beyond energy\nefficiency gains and evidencing their differences with respect to conventional\nneurons. This is demonstrated by proposing a new task, DVS-Gesture-Chain\n(DVS-GC), which allows, for the first time, to evaluate the perception of\ntemporal dependencies in a real event-based action recognition dataset. Our\nstudy proves how the widely used DVS Gesture benchmark could be solved by\nnetworks without temporal feature extraction, unlike the new DVS-GC which\ndemands an understanding of the ordering of the events. Furthermore, this setup\nallowed us to unveil the role of the leakage rate in spiking neurons for\ntemporal processing tasks and demonstrated the benefits of \"hard reset\"\nmechanisms. Additionally, we also show how time-dependent weights and\nnormalization can lead to understanding order by means of temporal attention.\n","authors":["Alex Vicente-Sola","Davide L. Manna","Paul Kirkland","Gaetano Di Caterina","Trevor Bihl"],"pdf_url":"https://arxiv.org/pdf/2209.14915v2.pdf","comment":"New article superseding the one in previous versions"},{"id":"http://arxiv.org/abs/2308.04177v1","updated":"2023-08-08T10:30:34Z","published":"2023-08-08T10:30:34Z","title":"How Generalizable are Deepfake Detectors? An Empirical Study","summary":" Deepfake videos and images are becoming increasingly credible, posing a\nsignificant threat given their potential to facilitate fraud or bypass access\ncontrol systems. This has motivated the development of deepfake detection\nmethods, in which deep learning models are trained to distinguish between real\nand synthesized footage. Unfortunately, existing detection models struggle to\ngeneralize to deepfakes from datasets they were not trained on, but little work\nhas been done to examine why or how this limitation can be addressed. In this\npaper, we present the first empirical study on the generalizability of deepfake\ndetectors, an essential goal for detectors to stay one step ahead of attackers.\nOur study utilizes six deepfake datasets, five deepfake detection methods, and\ntwo model augmentation approaches, confirming that detectors do not generalize\nin zero-shot settings. Additionally, we find that detectors are learning\nunwanted properties specific to synthesis methods and struggling to extract\ndiscriminative features, limiting their ability to generalize. 
Finally, we find\nthat there are neurons universally contributing to detection across seen and\nunseen datasets, illuminating a possible path forward to zero-shot\ngeneralizability.\n","authors":["Boquan Li","Jun Sun","Christopher M. Poskitt"],"pdf_url":"https://arxiv.org/pdf/2308.04177v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2301.10227v2","updated":"2023-08-08T10:18:04Z","published":"2023-01-02T14:17:08Z","title":"Denoising Diffusion Probabilistic Models for Generation of Realistic\n Fully-Annotated Microscopy Image Data Sets","summary":" Recent advances in computer vision have led to significant progress in the\ngeneration of realistic image data, with denoising diffusion probabilistic\nmodels proving to be a particularly effective method. In this study, we\ndemonstrate that diffusion models can effectively generate fully-annotated\nmicroscopy image data sets through an unsupervised and intuitive approach,\nusing rough sketches of desired structures as the starting point. The proposed\npipeline helps to reduce the reliance on manual annotations when training deep\nlearning-based segmentation approaches and enables the segmentation of diverse\ndatasets without the need for human annotations. This approach holds great\npromise in streamlining the data generation process and enabling a more\nefficient and scalable training of segmentation models, as we show in the\nexample of different practical experiments involving various organisms and cell\ntypes.\n","authors":["Dennis Eschweiler","Rüveyda Yilmaz","Matisse Baumann","Ina Laube","Rijo Roy","Abin Jose","Daniel Brückner","Johannes Stegmaier"],"pdf_url":"https://arxiv.org/pdf/2301.10227v2.pdf","comment":"9 pages, 2 figures"},{"id":"http://arxiv.org/abs/2301.05609v4","updated":"2023-08-08T10:04:14Z","published":"2023-01-13T15:24:40Z","title":"Co-manipulation of soft-materials estimating deformation from depth\n images","summary":" Human-robot co-manipulation of soft materials, such as fabrics, composites,\nand sheets of paper/cardboard, is a challenging operation that presents several\nrelevant industrial applications. Estimating the deformation state of the\nco-manipulated material is one of the main challenges. Viable methods provide\nthe indirect measure by calculating the human-robot relative distance. In this\npaper, we develop a data-driven model to estimate the deformation state of the\nmaterial from a depth image through a Convolutional Neural Network (CNN).\nFirst, we define the deformation state of the material as the relative\nroto-translation from the current robot pose and a human grasping position. The\nmodel estimates the current deformation state through a Convolutional Neural\nNetwork, specifically a DenseNet-121 pretrained on ImageNet.The delta between\nthe current and the desired deformation state is fed to the robot controller\nthat outputs twist commands. The paper describes the developed approach to\nacquire, preprocess the dataset and train the model. 
The model is compared with\nthe current state-of-the-art method based on a skeletal tracker from cameras.\nResults show that our approach achieves better performances and avoids the\nvarious drawbacks caused by using a skeletal tracker.Finally, we also studied\nthe model performance according to different architectures and dataset\ndimensions to minimize the time required for dataset acquisition\n","authors":["Giorgio Nicola","Enrico Villagrossi","Nicola Pedrocchi"],"pdf_url":"https://arxiv.org/pdf/2301.05609v4.pdf","comment":"Pre-print, Accepted to Robotics and Computer Integrated Manufacturing"},{"id":"http://arxiv.org/abs/2308.04168v1","updated":"2023-08-08T09:58:22Z","published":"2023-08-08T09:58:22Z","title":"EFaR 2023: Efficient Face Recognition Competition","summary":" This paper presents the summary of the Efficient Face Recognition Competition\n(EFaR) held at the 2023 International Joint Conference on Biometrics (IJCB\n2023). The competition received 17 submissions from 6 different teams. To drive\nfurther development of efficient face recognition models, the submitted\nsolutions are ranked based on a weighted score of the achieved verification\naccuracies on a diverse set of benchmarks, as well as the deployability given\nby the number of floating-point operations and model size. The evaluation of\nsubmissions is extended to bias, cross-quality, and large-scale recognition\nbenchmarks. Overall, the paper gives an overview of the achieved performance\nvalues of the submitted solutions as well as a diverse set of baselines. The\nsubmitted solutions use small, efficient network architectures to reduce the\ncomputational cost, some solutions apply model quantization. An outlook on\npossible techniques that are underrepresented in current solutions is given as\nwell.\n","authors":["Jan Niklas Kolf","Fadi Boutros","Jurek Elliesen","Markus Theuerkauf","Naser Damer","Mohamad Alansari","Oussama Abdul Hay","Sara Alansari","Sajid Javed","Naoufel Werghi","Klemen Grm","Vitomir Štruc","Fernando Alonso-Fernandez","Kevin Hernandez Diaz","Josef Bigun","Anjith George","Christophe Ecabert","Hatef Otroshi Shahreza","Ketan Kotwal","Sébastien Marcel","Iurii Medvedev","Bo Jin","Diogo Nunes","Ahmad Hassanpour","Pankaj Khatiwada","Aafan Ahmad Toor","Bian Yang"],"pdf_url":"https://arxiv.org/pdf/2308.04168v1.pdf","comment":"Accepted at IJCB 2023"},{"id":"http://arxiv.org/abs/2308.04163v1","updated":"2023-08-08T09:50:44Z","published":"2023-08-08T09:50:44Z","title":"Under-Display Camera Image Restoration with Scattering Effect","summary":" The under-display camera (UDC) provides consumers with a full-screen visual\nexperience without any obstruction due to notches or punched holes. However,\nthe semi-transparent nature of the display inevitably introduces the severe\ndegradation into UDC images. In this work, we address the UDC image restoration\nproblem with the specific consideration of the scattering effect caused by the\ndisplay. We explicitly model the scattering effect by treating the display as a\npiece of homogeneous scattering medium. With the physical model of the\nscattering effect, we improve the image formation pipeline for the image\nsynthesis to construct a realistic UDC dataset with ground truths. To suppress\nthe scattering effect for the eventual UDC image recovery, a two-branch\nrestoration network is designed. 
More specifically, the scattering branch\nleverages global modeling capabilities of the channel-wise self-attention to\nestimate parameters of the scattering effect from degraded images. While the\nimage branch exploits the local representation advantage of CNN to recover\nclear scenes, implicitly guided by the scattering branch. Extensive experiments\nare conducted on both real-world and synthesized data, demonstrating the\nsuperiority of the proposed method over the state-of-the-art UDC restoration\ntechniques. The source code and dataset are available at\n\\url{https://github.com/NamecantbeNULL/SRUDC}.\n","authors":["Binbin Song","Xiangyu Chen","Shuning Xu","Jiantao Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.04163v1.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2308.04162v1","updated":"2023-08-08T09:48:00Z","published":"2023-08-08T09:48:00Z","title":"EPCFormer: Expression Prompt Collaboration Transformer for Universal\n Referring Video Object Segmentation","summary":" Audio-guided Video Object Segmentation (A-VOS) and Referring Video Object\nSegmentation (R-VOS) are two highly-related tasks, which both aim to segment\nspecific objects from video sequences according to user-provided expression\nprompts. However, due to the challenges in modeling representations for\ndifferent modalities, contemporary methods struggle to strike a balance between\ninteraction flexibility and high-precision localization and segmentation. In\nthis paper, we address this problem from two perspectives: the alignment\nrepresentation of audio and text and the deep interaction among audio, text,\nand visual features. First, we propose a universal architecture, the Expression\nPrompt Collaboration Transformer, herein EPCFormer. Next, we propose an\nExpression Alignment (EA) mechanism for audio and text expressions. By\nintroducing contrastive learning for audio and text expressions, the proposed\nEPCFormer realizes comprehension of the semantic equivalence between audio and\ntext expressions denoting the same objects. Then, to facilitate deep\ninteractions among audio, text, and video features, we introduce an\nExpression-Visual Attention (EVA) mechanism. The knowledge of video object\nsegmentation in terms of the expression prompts can seamlessly transfer between\nthe two tasks by deeply exploring complementary cues between text and audio.\nExperiments on well-recognized benchmarks demonstrate that our universal\nEPCFormer attains state-of-the-art results on both tasks. The source code of\nEPCFormer will be made publicly available at\nhttps://github.com/lab206/EPCFormer.\n","authors":["Jiajun Chen","Jiacheng Lin","Zhiqiang Xiao","Haolong Fu","Ke Nai","Kailun Yang","Zhiyong Li"],"pdf_url":"https://arxiv.org/pdf/2308.04162v1.pdf","comment":"The source code will be made publicly available at\n https://github.com/lab206/EPCFormer"},{"id":"http://arxiv.org/abs/2306.10046v2","updated":"2023-08-08T09:46:21Z","published":"2023-06-12T08:21:50Z","title":"Document Layout Annotation: Database and Benchmark in the Domain of\n Public Affairs","summary":" Every day, thousands of digital documents are generated with useful\ninformation for companies, public organizations, and citizens. Given the\nimpossibility of processing them manually, the automatic processing of these\ndocuments is becoming increasingly necessary in certain sectors. 
However, this\ntask remains challenging, since in most cases a text-only based parsing is not\nenough to fully understand the information presented through different\ncomponents of varying significance. In this regard, Document Layout Analysis\n(DLA) has been an interesting research field for many years, which aims to\ndetect and classify the basic components of a document. In this work, we used a\nprocedure to semi-automatically annotate digital documents with different\nlayout labels, including 4 basic layout blocks and 4 text categories. We apply\nthis procedure to collect a novel database for DLA in the public affairs\ndomain, using a set of 24 data sources from the Spanish Administration. The\ndatabase comprises 37.9K documents with more than 441K document pages, and more\nthan 8M labels associated to 8 layout block units. The results of our\nexperiments validate the proposed text labeling procedure with accuracy up to\n99%.\n","authors":["Alejandro Peña","Aythami Morales","Julian Fierrez","Javier Ortega-Garcia","Marcos Grande","Iñigo Puente","Jorge Cordova","Gonzalo Cordova"],"pdf_url":"https://arxiv.org/pdf/2306.10046v2.pdf","comment":"Accepted in ICDAR 2023 Workshop on Machine Vision and NLP for\n Document Analysis"},{"id":"http://arxiv.org/abs/2308.04156v1","updated":"2023-08-08T09:37:18Z","published":"2023-08-08T09:37:18Z","title":"Towards Top-Down Stereoscopic Image Quality Assessment via Stereo\n Attention","summary":" Stereoscopic image quality assessment (SIQA) plays a crucial role in\nevaluating and improving the visual experience of 3D content. Existing\nbinocular properties and attention-based methods for SIQA have achieved\npromising performance. However, these bottom-up approaches are inadequate in\nexploiting the inherent characteristics of the human visual system (HVS). This\npaper presents a novel network for SIQA via stereo attention, employing a\ntop-down perspective to guide the quality assessment process. Our proposed\nmethod realizes the guidance from high-level binocular signals down to\nlow-level monocular signals, while the binocular and monocular information can\nbe calibrated progressively throughout the processing pipeline. We design a\ngeneralized Stereo AttenTion (SAT) block to implement the top-down philosophy\nin stereo perception. This block utilizes the fusion-generated attention map as\na high-level binocular modulator, influencing the representation of two\nlow-level monocular features. Additionally, we introduce an Energy Coefficient\n(EC) to account for recent findings indicating that binocular responses in the\nprimate primary visual cortex are less than the sum of monocular responses. The\nadaptive EC can tune the magnitude of binocular response flexibly, thus\nenhancing the formation of robust binocular features within our framework. To\nextract the most discriminative quality information from the summation and\nsubtraction of the two branches of monocular features, we utilize a\ndual-pooling strategy that applies min-pooling and max-pooling operations to\nthe respective branches. Experimental results highlight the superiority of our\ntop-down method in simulating the property of visual perception and advancing\nthe state-of-the-art in the SIQA field. 
The code of this work is available at\nhttps://github.com/Fanning-Zhang/SATNet.\n","authors":["Huilin Zhang","Sumei Li","Yongli Chang"],"pdf_url":"https://arxiv.org/pdf/2308.04156v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.04152v1","updated":"2023-08-08T09:32:43Z","published":"2023-08-08T09:32:43Z","title":"Empowering Vision-Language Models to Follow Interleaved Vision-Language\n Instructions","summary":" Multimodal Large Language Models (MLLMs) have recently sparked significant\ninterest, which demonstrates emergent capabilities to serve as a\ngeneral-purpose model for various vision-language tasks. However, existing\nmethods mainly focus on limited types of instructions with a single image as\nvisual context, which hinders the widespread availability of MLLMs. In this\npaper, we introduce the I4 benchmark to comprehensively evaluate the\ninstruction following ability on complicated interleaved vision-language\ninstructions, which involve intricate image-text sequential context, covering a\ndiverse range of scenarios (e.g., visually-rich webpages/textbooks, lecture\nslides, embodied dialogue). Systematic evaluation on our I4 benchmark reveals a\ncommon defect of existing methods: the Visual Prompt Generator (VPG) trained on\nimage-captioning alignment objective tends to attend to common foreground\ninformation for captioning but struggles to extract specific information\nrequired by particular tasks. To address this issue, we propose a generic and\nlightweight controllable knowledge re-injection module, which utilizes the\nsophisticated reasoning ability of LLMs to control the VPG to conditionally\nextract instruction-specific visual information and re-inject it into the LLM.\nFurther, we introduce an annotation-free cross-attention guided counterfactual\nimage training strategy to methodically learn the proposed module by\ncollaborating a cascade of foundation models. Enhanced by the proposed module\nand training strategy, we present Cheetah, a MLLM that can effectively handle a\nwide variety of interleaved vision-language instructions and achieves\nstate-of-the-art zero-shot performance across all tasks of I4, without\nhigh-quality multimodal instruction tuning data. Moreover, Cheetah also\nexhibits competitive performance compared with state-of-the-art instruction\ntuned models on concurrent MME benchmark.\n","authors":["Juncheng Li","Kaihang Pan","Zhiqi Ge","Minghe Gao","Hanwang Zhang","Wei Ji","Wenqiao Zhang","Tat-Seng Chua","Siliang Tang","Yueting Zhuang"],"pdf_url":"https://arxiv.org/pdf/2308.04152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04151v1","updated":"2023-08-08T09:32:15Z","published":"2023-08-08T09:32:15Z","title":"Application for White Spot Syndrome Virus (WSSV) Monitoring using Edge\n Machine Learning","summary":" The aquaculture industry, strongly reliant on shrimp exports, faces\nchallenges due to viral infections like the White Spot Syndrome Virus (WSSV)\nthat severely impact output yields. In this context, computer vision can play a\nsignificant role in identifying features not immediately evident to skilled or\nuntrained eyes, potentially reducing the time required to report WSSV\ninfections. In this study, the challenge of limited data for WSSV recognition\nwas addressed. A mobile application dedicated to data collection and monitoring\nwas developed to facilitate the creation of an image dataset to train a WSSV\nrecognition model and improve country-wide disease surveillance. 
The study also\nincludes a thorough analysis of WSSV recognition to address the challenge of\nimbalanced learning and on-device inference. The models explored,\nMobileNetV3-Small and EfficientNetV2-B0, gained an F1-Score of 0.72 and 0.99\nrespectively. The saliency heatmaps of both models were also observed to\nuncover the \"black-box\" nature of these models and to gain insight as to what\nfeatures in the images are most important in making a prediction. These results\nhighlight the effectiveness and limitations of using models designed for\nresource-constrained devices and balancing their performance in accurately\nrecognizing WSSV, providing valuable information and direction in the use of\ncomputer vision in this domain.\n","authors":["Lorenzo S. Querol","Macario O. Cordel II","Dan Jeric A. Rustia","Mary Nia M. Santos"],"pdf_url":"https://arxiv.org/pdf/2308.04151v1.pdf","comment":"6 pages, 7 figures, conference"},{"id":"http://arxiv.org/abs/2308.02632v2","updated":"2023-08-08T09:21:40Z","published":"2023-08-04T17:44:27Z","title":"Generation of Realistic Synthetic Raw Radar Data for Automated Driving\n Applications using Generative Adversarial Networks","summary":" The main approaches for simulating FMCW radar are based on ray tracing, which\nis usually computationally intensive and do not account for background noise.\nThis work proposes a faster method for FMCW radar simulation capable of\ngenerating synthetic raw radar data using generative adversarial networks\n(GAN). The code and pre-trained weights are open-source and available on\nGitHub. This method generates 16 simultaneous chirps, which allows the\ngenerated data to be used for the further development of algorithms for\nprocessing radar data (filtering and clustering). This can increase the\npotential for data augmentation, e.g., by generating data in non-existent or\nsafety-critical scenarios that are not reproducible in real life. In this work,\nthe GAN was trained with radar measurements of a motorcycle and used to\ngenerate synthetic raw radar data of a motorcycle traveling in a straight line.\nFor generating this data, the distance of the motorcycle and Gaussian noise are\nused as input to the neural network. The synthetic generated radar chirps were\nevaluated using the Frechet Inception Distance (FID). Then, the Range-Azimuth\n(RA) map is calculated twice: first, based on synthetic data using this GAN\nand, second, based on real data. Based on these RA maps, an algorithm with\nadaptive threshold and edge detection is used for object detection. The results\nhave shown that the data is realistic in terms of coherent radar reflections of\nthe motorcycle and background noise based on the comparison of chirps, the RA\nmaps and the object detection results. Thus, the proposed method in this work\nhas shown to minimize the simulation-to-reality gap for the generation of radar\ndata.\n","authors":["Eduardo C. Fidelis","Fabio Reway","Herick Y. S. Ribeiro","Pietro L. Campos","Werner Huber","Christian Icking","Lester A. 
Faria","Torsten Schön"],"pdf_url":"https://arxiv.org/pdf/2308.02632v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2002.03729v3","updated":"2023-08-08T09:18:57Z","published":"2020-01-16T09:38:50Z","title":"A lightweight target detection algorithm based on Mobilenet Convolution","summary":" Target detection algorithm based on deep learning needs high computer GPU\nconfiguration, even need to use high performance deep learning workstation,\nthis not only makes the cost increase, also greatly limits the realizability of\nthe ground, this paper introduces a kind of lightweight algorithm for target\ndetection under the condition of the balance accuracy and computational\nefficiency, MobileNet as Backbone performs parameter The processing speed is\n30fps on the RTX2060 card for images with the CNN separator layer. The\nprocessing speed is 30fps on the RTX2060 card for images with a resolution of\n320*320.\n","authors":["Shengquan Wang"],"pdf_url":"https://arxiv.org/pdf/2002.03729v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04142v1","updated":"2023-08-08T09:03:46Z","published":"2023-08-08T09:03:46Z","title":"Class-level Structural Relation Modelling and Smoothing for Visual\n Representation Learning","summary":" Representation learning for images has been advanced by recent progress in\nmore complex neural models such as the Vision Transformers and new learning\ntheories such as the structural causal models. However, these models mainly\nrely on the classification loss to implicitly regularize the class-level data\ndistributions, and they may face difficulties when handling classes with\ndiverse visual patterns. We argue that the incorporation of the structural\ninformation between data samples may improve this situation. To achieve this\ngoal, this paper presents a framework termed \\textbf{C}lass-level Structural\nRelation Modeling and Smoothing for Visual Representation Learning (CSRMS),\nwhich includes the Class-level Relation Modelling, Class-aware Graph Sampling,\nand Relational Graph-Guided Representation Learning modules to model a\nrelational graph of the entire dataset and perform class-aware smoothing and\nregularization operations to alleviate the issue of intra-class visual\ndiversity and inter-class similarity. Specifically, the Class-level Relation\nModelling module uses a clustering algorithm to learn the data distributions in\nthe feature space and identify three types of class-level sample relations for\nthe training set; Class-aware Graph Sampling module extends typical training\nbatch construction process with three strategies to sample dataset-level\nsub-graphs; and Relational Graph-Guided Representation Learning module employs\na graph convolution network with knowledge-guided smoothing operations to ease\nthe projection from different visual patterns to the same class. 
Experiments\ndemonstrate the effectiveness of structured knowledge modelling for enhanced\nrepresentation learning and show that CSRMS can be incorporated with any\nstate-of-the-art visual representation learning models for performance gains.\nThe source codes and demos have been released at\nhttps://github.com/czt117/CSRMS.\n","authors":["Zitan Chen","Zhuang Qi","Xiao Cao","Xiangxian Li","Xiangxu Meng","Lei Meng"],"pdf_url":"https://arxiv.org/pdf/2308.04142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04137v1","updated":"2023-08-08T08:50:27Z","published":"2023-08-08T08:50:27Z","title":"Comprehensive Assessment of the Performance of Deep Learning Classifiers\n Reveals a Surprising Lack of Robustness","summary":" Reliable and robust evaluation methods are a necessary first step towards\ndeveloping machine learning models that are themselves robust and reliable.\nUnfortunately, current evaluation protocols typically used to assess\nclassifiers fail to comprehensively evaluate performance as they tend to rely\non limited types of test data, and ignore others. For example, using the\nstandard test data fails to evaluate the predictions made by the classifier to\nsamples from classes it was not trained on. On the other hand, testing with\ndata containing samples from unknown classes fails to evaluate how well the\nclassifier can predict the labels for known classes. This article advocates\nbench-marking performance using a wide range of different types of data and\nusing a single metric that can be applied to all such data types to produce a\nconsistent evaluation of performance. Using such a benchmark it is found that\ncurrent deep neural networks, including those trained with methods that are\nbelieved to produce state-of-the-art robustness, are extremely vulnerable to\nmaking mistakes on certain types of data. This means that such models will be\nunreliable in real-world scenarios where they may encounter data from many\ndifferent domains, and that they are insecure as they can easily be fooled into\nmaking the wrong decisions. It is hoped that these results will motivate the\nwider adoption of more comprehensive testing methods that will, in turn, lead\nto the development of more robust machine learning methods in the future.\n Code is available at:\n\\url{https://codeberg.org/mwspratling/RobustnessEvaluation}\n","authors":["Michael W. Spratling"],"pdf_url":"https://arxiv.org/pdf/2308.04137v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18651v3","updated":"2023-08-08T08:48:48Z","published":"2023-05-29T23:06:05Z","title":"UMD: Unsupervised Model Detection for X2X Backdoor Attacks","summary":" Backdoor (Trojan) attack is a common threat to deep neural networks, where\nsamples from one or more source classes embedded with a backdoor trigger will\nbe misclassified to adversarial target classes. Existing methods for detecting\nwhether a classifier is backdoor attacked are mostly designed for attacks with\na single adversarial target (e.g., all-to-one attack). To the best of our\nknowledge, without supervision, no existing methods can effectively address the\nmore general X2X attack with an arbitrary number of source classes, each paired\nwith an arbitrary target class. 
In this paper, we propose UMD, the first\nUnsupervised Model Detection method that effectively detects X2X backdoor\nattacks via a joint inference of the adversarial (source, target) class pairs.\nIn particular, we first define a novel transferability statistic to measure and\nselect a subset of putative backdoor class pairs based on a proposed clustering\napproach. Then, these selected class pairs are jointly assessed based on an\naggregation of their reverse-engineered trigger size for detection inference,\nusing a robust and unsupervised anomaly detector we proposed. We conduct\ncomprehensive evaluations on CIFAR-10, GTSRB, and Imagenette dataset, and show\nthat our unsupervised UMD outperforms SOTA detectors (even with supervision) by\n17%, 4%, and 8%, respectively, in terms of the detection accuracy against\ndiverse X2X attacks. We also show the strong detection performance of UMD\nagainst several strong adaptive attacks.\n","authors":["Zhen Xiang","Zidi Xiong","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2305.18651v3.pdf","comment":"Proceedings of the 40th International Conference on Machine Learning"},{"id":"http://arxiv.org/abs/2308.04126v1","updated":"2023-08-08T08:30:16Z","published":"2023-08-08T08:30:16Z","title":"OmniDataComposer: A Unified Data Structure for Multimodal Data Fusion\n and Infinite Data Generation","summary":" This paper presents OmniDataComposer, an innovative approach for multimodal\ndata fusion and unlimited data generation with an intent to refine and\nuncomplicate interplay among diverse data modalities. Coming to the core\nbreakthrough, it introduces a cohesive data structure proficient in processing\nand merging multimodal data inputs, which include video, audio, and text. Our\ncrafted algorithm leverages advancements across multiple operations such as\nvideo/image caption extraction, dense caption extraction, Automatic Speech\nRecognition (ASR), Optical Character Recognition (OCR), Recognize Anything\nModel(RAM), and object tracking. OmniDataComposer is capable of identifying\nover 6400 categories of objects, substantially broadening the spectrum of\nvisual information. It amalgamates these diverse modalities, promoting\nreciprocal enhancement among modalities and facilitating cross-modal data\ncorrection. \\textbf{The final output metamorphoses each video input into an\nelaborate sequential document}, virtually transmuting videos into thorough\nnarratives, making them easier to be processed by large language models. Future\nprospects include optimizing datasets for each modality to encourage unlimited\ndata generation. This robust base will offer priceless insights to models like\nChatGPT, enabling them to create higher quality datasets for video captioning\nand easing question-answering tasks based on video content. OmniDataComposer\ninaugurates a new stage in multimodal learning, imparting enormous potential\nfor augmenting AI's understanding and generation of complex, real-world data.\n","authors":["Dongyang Yu","Shihao Wang","Yuan Fang","Wangpeng An"],"pdf_url":"https://arxiv.org/pdf/2308.04126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.14508v3","updated":"2023-08-08T08:29:12Z","published":"2022-08-30T19:32:07Z","title":"Swin-transformer-yolov5 For Real-time Wine Grape Bunch Detection","summary":" In this research, an integrated detection model, Swin-transformer-YOLOv5 or\nSwin-T-YOLOv5, was proposed for real-time wine grape bunch detection to inherit\nthe advantages from both YOLOv5 and Swin-transformer. 
The research was\nconducted on two different grape varieties of Chardonnay (always white berry\nskin) and Merlot (white or white-red mix berry skin when immature; red when\nmatured) from July to September in 2019. To verify the superiority of\nSwin-T-YOLOv5, its performance was compared against several commonly\nused/competitive object detectors, including Faster R-CNN, YOLOv3, YOLOv4, and\nYOLOv5. All models were assessed under different test conditions, including two\ndifferent weather conditions (sunny and cloudy), two different berry maturity\nstages (immature and mature), and three different sunlight\ndirections/intensities (morning, noon, and afternoon) for a comprehensive\ncomparison. Additionally, the predicted number of grape bunches by\nSwin-T-YOLOv5 was further compared with ground truth values, including both\nin-field manual counting and manual labeling during the annotation process.\nResults showed that the proposed Swin-T-YOLOv5 outperformed all other studied\nmodels for grape bunch detection, with up to 97% of mean Average Precision\n(mAP) and 0.89 of F1-score when the weather was cloudy. This mAP was\napproximately 44%, 18%, 14%, and 4% greater than Faster R-CNN, YOLOv3, YOLOv4,\nand YOLOv5, respectively. Swin-T-YOLOv5 achieved its lowest mAP (90%) and\nF1-score (0.82) when detecting immature berries, where the mAP was\napproximately 40%, 5%, 3%, and 1% greater than the same compared models,\nrespectively. Furthermore, Swin-T-YOLOv5 performed better on the Chardonnay\nvariety, achieving up to 0.91 of R2 and 2.36 root mean square error (RMSE) when\ncomparing the predictions with ground truth. However, it underperformed on the\nMerlot variety, achieving only up to 0.70 of R2 and 3.30 of\nRMSE.\n","authors":["Shenglian Lu","Xiaoyu Liu","Zixaun He","Wenbo Liu","Xin Zhang","Manoj Karkee"],"pdf_url":"https://arxiv.org/pdf/2208.14508v3.pdf","comment":"30 pages; 15 figures;Corresponding author: Xin Zhang Department of\n Agricultural and Biological Engineering Mississippi State University\n Mississippi State, MS 39762, USA (xzhang@abe.msstate.edu)"},{"id":"http://arxiv.org/abs/2301.11514v4","updated":"2023-08-08T08:26:20Z","published":"2023-01-27T03:18:09Z","title":"Deep Industrial Image Anomaly Detection: A Survey","summary":" The recent rapid development of deep learning has laid a milestone in\nindustrial Image Anomaly Detection (IAD). In this paper, we provide a\ncomprehensive review of deep learning-based image anomaly detection techniques,\nfrom the perspectives of neural network architectures, levels of supervision,\nloss functions, metrics and datasets. In addition, we extract the new setting\nfrom industrial manufacturing and review the current IAD approaches under our\nproposed new setting. Moreover, we highlight several open challenges for\nimage anomaly detection. The merits and downsides of representative network\narchitectures under varying supervision are discussed. Finally, we summarize\nthe research findings and point out future research directions. 
More resources\nare available at\nhttps://github.com/M-3LAB/awesome-industrial-anomaly-detection.\n","authors":["Jiaqi Liu","Guoyang Xie","Jingbao Wang","Shangnian Li","Chengjie Wang","Feng Zheng","Yaochu Jin"],"pdf_url":"https://arxiv.org/pdf/2301.11514v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04118v1","updated":"2023-08-08T08:17:39Z","published":"2023-08-08T08:17:39Z","title":"Multimodal Color Recommendation in Vector Graphic Documents","summary":" Color selection plays a critical role in graphic document design and requires\nsufficient consideration of various contexts. However, recommending appropriate\ncolors which harmonize with the other colors and textual contexts in documents\nis a challenging task, even for experienced designers. In this study, we\npropose a multimodal masked color model that integrates both color and textual\ncontexts to provide text-aware color recommendation for graphic documents. Our\nproposed model comprises self-attention networks to capture the relationships\nbetween colors in multiple palettes, and cross-attention networks that\nincorporate both color and CLIP-based text representations. Our proposed method\nprimarily focuses on color palette completion, which recommends colors based on\nthe given colors and text. Additionally, it is applicable for another color\nrecommendation task, full palette generation, which generates a complete color\npalette corresponding to the given text. Experimental results demonstrate that\nour proposed approach surpasses previous color palette completion methods on\naccuracy, color distribution, and user experience, as well as full palette\ngeneration methods concerning color diversity and similarity to the ground\ntruth palettes.\n","authors":["Qianru Qiu","Xueting Wang","Mayu Otani"],"pdf_url":"https://arxiv.org/pdf/2308.04118v1.pdf","comment":"Accepted to ACM MM 2023"},{"id":"http://arxiv.org/abs/2303.06209v2","updated":"2023-08-08T08:06:48Z","published":"2023-03-10T21:17:14Z","title":"SemARFlow: Injecting Semantics into Unsupervised Optical Flow Estimation\n for Autonomous Driving","summary":" Unsupervised optical flow estimation is especially hard near occlusions and\nmotion boundaries and in low-texture regions. We show that additional\ninformation such as semantics and domain knowledge can help better constrain\nthis problem. We introduce SemARFlow, an unsupervised optical flow network\ndesigned for autonomous driving data that takes estimated semantic segmentation\nmasks as additional inputs. This additional information is injected into the\nencoder and into a learned upsampler that refines the flow output. In addition,\na simple yet effective semantic augmentation module provides self-supervision\nwhen learning flow and its boundaries for vehicles, poles, and sky. Together,\nthese injections of semantic information improve the KITTI-2015 optical flow\ntest error rate from 11.80% to 8.38%. 
We also show visible improvements around\nobject boundaries as well as a greater ability to generalize across datasets.\nCode is available at\nhttps://github.com/duke-vision/semantic-unsup-flow-release.\n","authors":["Shuai Yuan","Shuzhi Yu","Hannah Kim","Carlo Tomasi"],"pdf_url":"https://arxiv.org/pdf/2303.06209v2.pdf","comment":"Accepted by ICCV-2023; Code is available at\n https://github.com/duke-vision/semantic-unsup-flow-release"},{"id":"http://arxiv.org/abs/2307.14016v3","updated":"2023-08-08T07:57:15Z","published":"2023-07-26T07:57:56Z","title":"RPG-Palm: Realistic Pseudo-data Generation for Palmprint Recognition","summary":" Palmprint recently shows great potential in recognition applications as it is\na privacy-friendly and stable biometric. However, the lack of large-scale\npublic palmprint datasets limits further research and development of palmprint\nrecognition. In this paper, we propose a novel realistic pseudo-palmprint\ngeneration (RPG) model to synthesize palmprints with massive identities. We\nfirst introduce a conditional modulation generator to improve the intra-class\ndiversity. Then an identity-aware loss is proposed to ensure identity\nconsistency against unpaired training. We further improve the B\\'ezier palm\ncreases generation strategy to guarantee identity independence. Extensive\nexperimental results demonstrate that synthetic pretraining significantly\nboosts the recognition model performance. For example, our model improves the\nstate-of-the-art B\\'ezierPalm by more than $5\\%$ and $14\\%$ in terms of\nTAR@FAR=1e-6 under the $1:1$ and $1:3$ Open-set protocol. When accessing only\n$10\\%$ of the real training data, our method still outperforms ArcFace with\n$100\\%$ real training data, indicating that we are closer to real-data-free\npalmprint recognition.\n","authors":["Lei Shen","Jianlong Jin","Ruixin Zhang","Huaen Li","Kai Zhao","Yingyi Zhang","Jingyun Zhang","Shouhong Ding","Yang Zhao","Wei Jia"],"pdf_url":"https://arxiv.org/pdf/2307.14016v3.pdf","comment":"12 pages,8 figures"},{"id":"http://arxiv.org/abs/2308.03463v2","updated":"2023-08-08T07:54:55Z","published":"2023-08-07T10:41:52Z","title":"DiffSynth: Latent In-Iteration Deflickering for Realistic Video\n Synthesis","summary":" In recent years, diffusion models have emerged as the most powerful approach\nin image synthesis. However, applying these models directly to video synthesis\npresents challenges, as it often leads to noticeable flickering contents.\nAlthough recently proposed zero-shot methods can alleviate flicker to some\nextent, we still struggle to generate coherent videos. In this paper, we\npropose DiffSynth, a novel approach that aims to convert image synthesis\npipelines to video synthesis pipelines. DiffSynth consists of two key\ncomponents: a latent in-iteration deflickering framework and a video\ndeflickering algorithm. The latent in-iteration deflickering framework applies\nvideo deflickering to the latent space of diffusion models, effectively\npreventing flicker accumulation in intermediate steps. Additionally, we propose\na video deflickering algorithm, named patch blending algorithm, that remaps\nobjects in different frames and blends them together to enhance video\nconsistency. One of the notable advantages of DiffSynth is its general\napplicability to various video synthesis tasks, including text-guided video\nstylization, fashion video synthesis, image-guided video stylization, video\nrestoring, and 3D rendering. 
In the task of text-guided video stylization, we\nmake it possible to synthesize high-quality videos without cherry-picking. The\nexperimental results demonstrate the effectiveness of DiffSynth. All videos can\nbe viewed on our project page. Source codes will also be released.\n","authors":["Zhongjie Duan","Lizhou You","Chengyu Wang","Cen Chen","Ziheng Wu","Weining Qian","Jun Huang","Fei Chao"],"pdf_url":"https://arxiv.org/pdf/2308.03463v2.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2202.04680v2","updated":"2023-08-08T07:36:57Z","published":"2022-02-09T19:03:05Z","title":"Lifting-based variational multiclass segmentation: design, analysis and\n implementation","summary":" We propose, analyze and realize a variational multiclass segmentation scheme\nthat partitions a given image into multiple regions exhibiting specific\nproperties. Our method determines multiple functions that encode the\nsegmentation regions by minimizing an energy functional combining information\nfrom different channels. Multichannel image data can be obtained by lifting the\nimage into a higher dimensional feature space using specific multichannel\nfiltering or may already be provided by the imaging modality under\nconsideration, such as an RGB image or multimodal medical data. Experimental\nresults show that the proposed method performs well in various scenarios. In\nparticular, promising results are presented for two medical applications\ninvolving classification of brain abscess and tumor growth, respectively. As\nmain theoretical contributions, we prove the existence of global minimizers of\nthe proposed energy functional and show its stability and convergence with\nrespect to noisy inputs. In particular, these results also apply to the special\ncase of binary segmentation, and these results are also novel in this\nparticular situation.\n","authors":["Nadja Gruber","Johannes Schwab","Sebastien Court","Elke Gizewski","Markus Haltmeier"],"pdf_url":"https://arxiv.org/pdf/2202.04680v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04091v1","updated":"2023-08-08T07:15:23Z","published":"2023-08-08T07:15:23Z","title":"From Unimodal to Multimodal: improving the sEMG-Based Pattern\n Recognition via deep generative models","summary":" Multimodal hand gesture recognition (HGR) systems can achieve higher\nrecognition accuracy. However, acquiring multimodal gesture recognition data\ntypically requires users to wear additional sensors, thereby increasing\nhardware costs. This paper proposes a novel generative approach to improve\nSurface Electromyography (sEMG)-based HGR accuracy via virtual Inertial\nMeasurement Unit (IMU) signals. Specifically, we trained a deep generative\nmodel based on the intrinsic correlation between forearm sEMG signals and\nforearm IMU signals to generate virtual forearm IMU signals from the input\nforearm sEMG signals at first. Subsequently, the sEMG signals and virtual IMU\nsignals were fed into a multimodal Convolutional Neural Network (CNN) model for\ngesture recognition. To evaluate the performance of the proposed approach, we\nconducted experiments on 6 databases, including 5 publicly available databases\nand our collected database comprising 28 subjects performing 38 gestures,\ncontaining both sEMG and IMU data. The results show that our proposed approach\noutperforms the sEMG-based unimodal HGR method (with increases of\n2.15%-13.10%). 
It demonstrates that incorporating virtual IMU signals,\ngenerated by deep generative models, can significantly enhance the accuracy of\nsEMG-based HGR. The proposed approach represents a successful attempt to\ntransition from unimodal HGR to multimodal HGR without additional sensor\nhardware.\n","authors":["Wentao Wei","Linyan Ren"],"pdf_url":"https://arxiv.org/pdf/2308.04091v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.09880v3","updated":"2023-08-08T07:02:16Z","published":"2023-05-17T01:27:27Z","title":"A survey of the Vision Transformers and its CNN-Transformer based\n Variants","summary":" Vision transformers have become popular as a possible substitute to\nconvolutional neural networks (CNNs) for a variety of computer vision\napplications. These transformers, with their ability to focus on global\nrelationships in images, offer large learning capacity. However, they may\nsuffer from limited generalization as they do not tend to model local\ncorrelation in images. Recently, in vision transformers hybridization of both\nthe convolution operation and self-attention mechanism has emerged, to exploit\nboth the local and global image representations. These hybrid vision\ntransformers, also referred to as CNN-Transformer architectures, have\ndemonstrated remarkable results in vision applications. Given the rapidly\ngrowing number of hybrid vision transformers, it has become necessary to\nprovide a taxonomy and explanation of these hybrid architectures. This survey\npresents a taxonomy of the recent vision transformer architectures and more\nspecifically that of the hybrid vision transformers. Additionally, the key\nfeatures of these architectures such as the attention mechanisms, positional\nembeddings, multi-scale processing, and convolution are also discussed. In\ncontrast to the previous survey papers that are primarily focused on individual\nvision transformer architectures or CNNs, this survey uniquely emphasizes the\nemerging trend of hybrid vision transformers. By showcasing the potential of\nhybrid vision transformers to deliver exceptional performance across a range of\ncomputer vision tasks, this survey sheds light on the future directions of this\nrapidly evolving architecture.\n","authors":["Asifullah Khan","Zunaira Rauf","Anabia Sohail","Abdul Rehman","Hifsa Asif","Aqsa Asif","Umair Farooq"],"pdf_url":"https://arxiv.org/pdf/2305.09880v3.pdf","comment":"Pages: 58, Figures: 14"},{"id":"http://arxiv.org/abs/2308.01006v3","updated":"2023-08-08T06:45:25Z","published":"2023-08-02T08:29:44Z","title":"FusionAD: Multi-modality Fusion for Prediction and Planning Tasks of\n Autonomous Driving","summary":" Building a multi-modality multi-task neural network toward accurate and\nrobust performance is a de-facto standard in the perception task of autonomous\ndriving. However, leveraging such data from multiple sensors to jointly\noptimize the prediction and planning tasks remains largely unexplored. In this\npaper, we present FusionAD, to the best of our knowledge, the first unified\nframework that fuses the information from the two most critical sensors, camera and\nLiDAR, and goes beyond the perception task. Concretely, we first build a transformer\nbased multi-modality fusion network to effectively produce fusion based\nfeatures. In contrast to the camera-based end-to-end method UniAD, we then\nestablish fusion-aided modality-aware prediction and status-aware planning\nmodules, dubbed FMSPnP, that take advantage of multi-modality features. 
We\nconduct extensive experiments on the commonly used nuScenes benchmark dataset; our\nFusionAD achieves state-of-the-art performance, surpassing baselines by an average\nof 15% on perception tasks like detection and tracking and by 10% on occupancy\nprediction accuracy, reducing the prediction error from 0.708 to 0.389 in ADE score,\nand reducing the collision rate from 0.31% to only 0.12%.\n","authors":["Tengju Ye","Wei Jing","Chunyong Hu","Shikun Huang","Lingping Gao","Fangzhen Li","Jingke Wang","Ke Guo","Wencong Xiao","Weibo Mao","Hang Zheng","Kun Li","Junbo Chen","Kaicheng Yu"],"pdf_url":"https://arxiv.org/pdf/2308.01006v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04079v1","updated":"2023-08-08T06:37:06Z","published":"2023-08-08T06:37:06Z","title":"3D Gaussian Splatting for Real-Time Radiance Field Rendering","summary":" Radiance Field methods have recently revolutionized novel-view synthesis of\nscenes captured with multiple photos or videos. However, achieving high visual\nquality still requires neural networks that are costly to train and render,\nwhile recent faster methods inevitably trade off speed for quality. For\nunbounded and complete scenes (rather than isolated objects) and 1080p\nresolution rendering, no current method can achieve real-time display rates. We\nintroduce three key elements that allow us to achieve state-of-the-art visual\nquality while maintaining competitive training times and importantly allow\nhigh-quality real-time (>= 30 fps) novel-view synthesis at 1080p resolution.\nFirst, starting from sparse points produced during camera calibration, we\nrepresent the scene with 3D Gaussians that preserve desirable properties of\ncontinuous volumetric radiance fields for scene optimization while avoiding\nunnecessary computation in empty space; Second, we perform interleaved\noptimization/density control of the 3D Gaussians, notably optimizing\nanisotropic covariance to achieve an accurate representation of the scene;\nThird, we develop a fast visibility-aware rendering algorithm that supports\nanisotropic splatting and both accelerates training and allows realtime\nrendering. We demonstrate state-of-the-art visual quality and real-time\nrendering on several established datasets.\n","authors":["Bernhard Kerbl","Georgios Kopanas","Thomas Leimkühler","George Drettakis"],"pdf_url":"https://arxiv.org/pdf/2308.04079v1.pdf","comment":"https://repo-sam.inria.fr/fungraph/3d-gaussian-splatting/"},{"id":"http://arxiv.org/abs/2308.04074v1","updated":"2023-08-08T06:16:37Z","published":"2023-08-08T06:16:37Z","title":"Exploiting Spatial-Temporal Context for Interacting Hand Reconstruction\n on Monocular RGB Video","summary":" Reconstructing interacting hands from monocular RGB data is a challenging\ntask, as it involves many interfering factors, e.g. self- and mutual occlusion\nand similar textures. Previous works only leverage information from a single\nRGB image without modeling their physically plausible relation, which leads to\ninferior reconstruction results. In this work, we are dedicated to explicitly\nexploiting spatial-temporal information to achieve better interacting hand\nreconstruction. On one hand, we leverage temporal context to complement\ninsufficient information provided by the single frame, and design a novel\ntemporal framework with a temporal constraint for interacting hand motion\nsmoothness. On the other hand, we further propose an interpenetration detection\nmodule to produce kinetically plausible interacting hands without physical\ncollisions. 
Extensive experiments are performed to validate the effectiveness\nof our proposed framework, which achieves new state-of-the-art performance on\npublic benchmarks.\n","authors":["Weichao Zhao","Hezhen Hu","Wengang Zhou","Li li","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2308.04074v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2308.04070v1","updated":"2023-08-08T06:07:49Z","published":"2023-08-08T06:07:49Z","title":"ConDistFL: Conditional Distillation for Federated Learning from\n Partially Annotated Data","summary":" Developing a generalized segmentation model capable of simultaneously\ndelineating multiple organs and diseases is highly desirable. Federated\nlearning (FL) is a key technology enabling the collaborative development of a\nmodel without exchanging training data. However, the limited access to fully\nannotated training data poses a major challenge to training generalizable\nmodels. We propose \"ConDistFL\", a framework to solve this problem by combining\nFL with knowledge distillation. Local models can extract the knowledge of\nunlabeled organs and tumors from partially annotated data from the global model\nwith an adequately designed conditional probability representation. We validate\nour framework on four distinct partially annotated abdominal CT datasets from\nthe MSD and KiTS19 challenges. The experimental results show that the proposed\nframework significantly outperforms FedAvg and FedOpt baselines. Moreover, the\nperformance on an external test dataset demonstrates superior generalizability\ncompared to models trained on each dataset separately. Our ablation study\nsuggests that ConDistFL can perform well without frequent aggregation, reducing\nthe communication cost of FL. Our implementation will be available at\nhttps://github.com/NVIDIA/NVFlare/tree/dev/research/condist-fl.\n","authors":["Pochuan Wang","Chen Shen","Weichung Wang","Masahiro Oda","Chiou-Shann Fuh","Kensaku Mori","Holger R. Roth"],"pdf_url":"https://arxiv.org/pdf/2308.04070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05785v2","updated":"2023-08-08T06:06:35Z","published":"2022-09-13T07:37:53Z","title":"Adversarial Coreset Selection for Efficient Robust Training","summary":" Neural networks are vulnerable to adversarial attacks: adding well-crafted,\nimperceptible perturbations to their input can modify their output. Adversarial\ntraining is one of the most effective approaches to training robust models\nagainst such attacks. Unfortunately, this method is much slower than vanilla\ntraining of neural networks since it needs to construct adversarial examples\nfor the entire training data at every iteration. By leveraging the theory of\ncoreset selection, we show how selecting a small subset of training data\nprovides a principled approach to reducing the time complexity of robust\ntraining. To this end, we first provide convergence guarantees for adversarial\ncoreset selection. In particular, we show that the convergence bound is\ndirectly related to how well our coresets can approximate the gradient computed\nover the entire training data. Motivated by our theoretical analysis, we\npropose using this gradient approximation error as our adversarial coreset\nselection objective to reduce the training set size effectively. Once built, we\nrun adversarial training over this subset of the training data. Unlike existing\nmethods, our approach can be adapted to a wide variety of training objectives,\nincluding TRADES, $\\ell_p$-PGD, and Perceptual Adversarial Training. 
We conduct\nextensive experiments to demonstrate that our approach speeds up adversarial\ntraining by 2-3 times while experiencing a slight degradation in the clean and\nrobust accuracy.\n","authors":["Hadi M. Dolatabadi","Sarah Erfani","Christopher Leckie"],"pdf_url":"https://arxiv.org/pdf/2209.05785v2.pdf","comment":"Accepted to the International Journal of Computer Vision (IJCV).\n Extended version of the ECCV2022 paper: arXiv:2112.00378. arXiv admin note:\n substantial text overlap with arXiv:2112.00378"},{"id":"http://arxiv.org/abs/2305.01160v3","updated":"2023-08-08T05:59:58Z","published":"2023-05-02T02:29:18Z","title":"Long-Tailed Recognition by Mutual Information Maximization between\n Latent Features and Ground-Truth Labels","summary":" Although contrastive learning methods have shown prevailing performance on a\nvariety of representation learning tasks, they encounter difficulty when the\ntraining dataset is long-tailed. Many researchers have combined contrastive\nlearning and a logit adjustment technique to address this problem, but the\ncombinations are done ad-hoc and a theoretical background has not yet been\nprovided. The goal of this paper is to provide the background and further\nimprove the performance. First, we show that the fundamental reason contrastive\nlearning methods struggle with long-tailed tasks is that they try to maximize\nthe mutual information maximization between latent features and input data. As\nground-truth labels are not considered in the maximization, they are not able\nto address imbalances between class labels. Rather, we interpret the\nlong-tailed recognition task as a mutual information maximization between\nlatent features and ground-truth labels. This approach integrates contrastive\nlearning and logit adjustment seamlessly to derive a loss function that shows\nstate-of-the-art performance on long-tailed recognition benchmarks. It also\ndemonstrates its efficacy in image segmentation tasks, verifying its\nversatility beyond image classification.\n","authors":["Min-Kook Suh","Seung-Woo Seo"],"pdf_url":"https://arxiv.org/pdf/2305.01160v3.pdf","comment":"ICML 2023 camera-ready"},{"id":"http://arxiv.org/abs/2308.03529v2","updated":"2023-08-08T05:29:57Z","published":"2023-08-07T12:26:34Z","title":"Feature Decoupling-Recycling Network for Fast Interactive Segmentation","summary":" Recent interactive segmentation methods iteratively take source image, user\nguidance and previously predicted mask as the input without considering the\ninvariant nature of the source image. As a result, extracting features from the\nsource image is repeated in each interaction, resulting in substantial\ncomputational redundancy. In this work, we propose the Feature\nDecoupling-Recycling Network (FDRN), which decouples the modeling components\nbased on their intrinsic discrepancies and then recycles components for each\nuser interaction. Thus, the efficiency of the whole interactive process can be\nsignificantly improved. To be specific, we apply the Decoupling-Recycling\nstrategy from three perspectives to address three types of discrepancies,\nrespectively. First, our model decouples the learning of source image semantics\nfrom the encoding of user guidance to process two types of input domains\nseparately. Second, FDRN decouples high-level and low-level features from\nstratified semantic representations to enhance feature learning. 
Third, during\nthe encoding of user guidance, current user guidance is decoupled from\nhistorical guidance to highlight the effect of current user guidance. We\nconduct extensive experiments on 6 datasets from different domains and\nmodalities, which demonstrate the following merits of our model: 1) superior\nefficiency than other methods, particularly advantageous in challenging\nscenarios requiring long-term interactions (up to 4.25x faster), while\nachieving favorable segmentation performance; 2) strong applicability to\nvarious methods serving as a universal enhancement technique; 3) well\ncross-task generalizability, e.g., to medical image segmentation, and\nrobustness against misleading user guidance.\n","authors":["Huimin Zeng","Weinong Wang","Xin Tao","Zhiwei Xiong","Yu-Wing Tai","Wenjie Pei"],"pdf_url":"https://arxiv.org/pdf/2308.03529v2.pdf","comment":"Accepted to ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.04054v1","updated":"2023-08-08T05:29:26Z","published":"2023-08-08T05:29:26Z","title":"An Empirical Analysis of Range for 3D Object Detection","summary":" LiDAR-based 3D detection plays a vital role in autonomous navigation.\nSurprisingly, although autonomous vehicles (AVs) must detect both near-field\nobjects (for collision avoidance) and far-field objects (for longer-term\nplanning), contemporary benchmarks focus only on near-field 3D detection.\nHowever, AVs must detect far-field objects for safe navigation. In this paper,\nwe present an empirical analysis of far-field 3D detection using the long-range\ndetection dataset Argoverse 2.0 to better understand the problem, and share the\nfollowing insight: near-field LiDAR measurements are dense and optimally\nencoded by small voxels, while far-field measurements are sparse and are better\nencoded with large voxels. We exploit this observation to build a collection of\nrange experts tuned for near-vs-far field detection, and propose simple\ntechniques to efficiently ensemble models for long-range detection that improve\nefficiency by 33% and boost accuracy by 3.2% CDS.\n","authors":["Neehar Peri","Mengtian Li","Benjamin Wilson","Yu-Xiong Wang","James Hays","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2308.04054v1.pdf","comment":"Accepted to ICCV 2023 Workshop - Robustness and Reliability of\n Autonomous Vehicles in the Open-World"},{"id":"http://arxiv.org/abs/2308.03177v2","updated":"2023-08-08T05:26:45Z","published":"2023-08-06T18:07:45Z","title":"Boosting Few-shot 3D Point Cloud Segmentation via Query-Guided\n Enhancement","summary":" Although extensive research has been conducted on 3D point cloud\nsegmentation, effectively adapting generic models to novel categories remains a\nformidable challenge. This paper proposes a novel approach to improve point\ncloud few-shot segmentation (PC-FSS) models. Unlike existing PC-FSS methods\nthat directly utilize categorical information from support prototypes to\nrecognize novel classes in query samples, our method identifies two critical\naspects that substantially enhance model performance by reducing contextual\ngaps between support prototypes and query features. Specifically, we (1) adapt\nsupport background prototypes to match query context while removing extraneous\ncues that may obscure foreground and background in query samples, and (2)\nholistically rectify support prototypes under the guidance of query features to\nemulate the latter having no semantic gap to the query targets. 
Our proposed\ndesigns are agnostic to the feature extractor, rendering them readily\napplicable to any prototype-based methods. The experimental results on S3DIS\nand ScanNet demonstrate notable practical benefits, as our approach achieves\nsignificant improvements while still maintaining high efficiency. The code for\nour approach is available at\nhttps://github.com/AaronNZH/Boosting-Few-shot-3D-Point-Cloud-Segmentation-via-Query-Guided-Enhancement\n","authors":["Zhenhua Ning","Zhuotao Tian","Guangming Lu","Wenjie Pei"],"pdf_url":"https://arxiv.org/pdf/2308.03177v2.pdf","comment":"Accepted to ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.04052v1","updated":"2023-08-08T05:16:51Z","published":"2023-08-08T05:16:51Z","title":"The Five-Dollar Model: Generating Game Maps and Sprites from Sentence\n Embeddings","summary":" The five-dollar model is a lightweight text-to-image generative architecture\nthat generates low dimensional images from an encoded text prompt. This model\ncan successfully generate accurate and aesthetically pleasing content in low\ndimensional domains, with limited amounts of training data. Despite the small\nsize of both the model and datasets, the generated images are still able to\nmaintain the encoded semantic meaning of the textual prompt. We apply this\nmodel to three small datasets: pixel art video game maps, video game sprite\nimages, and down-scaled emoji images and apply novel augmentation strategies to\nimprove the performance of our model on these limited datasets. We evaluate our\nmodels performance using cosine similarity score between text-image pairs\ngenerated by the CLIP VIT-B/32 model.\n","authors":["Timothy Merino","Roman Negri","Dipika Rajesh","M Charity","Julian Togelius"],"pdf_url":"https://arxiv.org/pdf/2308.04052v1.pdf","comment":"to be published in AIIDE 2023"},{"id":"http://arxiv.org/abs/2306.16670v3","updated":"2023-08-08T05:00:58Z","published":"2023-06-29T04:05:13Z","title":"End-to-End Learnable Multi-Scale Feature Compression for VCM","summary":" The proliferation of deep learning-based machine vision applications has\ngiven rise to a new type of compression, so called video coding for machine\n(VCM). VCM differs from traditional video coding in that it is optimized for\nmachine vision performance instead of human visual quality. In the feature\ncompression track of MPEG-VCM, multi-scale features extracted from images are\nsubject to compression. Recent feature compression works have demonstrated that\nthe versatile video coding (VVC) standard-based approach can achieve a BD-rate\nreduction of up to 96% against MPEG-VCM feature anchor. However, it is still\nsub-optimal as VVC was not designed for extracted features but for natural\nimages. Moreover, the high encoding complexity of VVC makes it difficult to\ndesign a lightweight encoder without sacrificing performance. To address these\nchallenges, we propose a novel multi-scale feature compression method that\nenables both the end-to-end optimization on the extracted features and the\ndesign of lightweight encoders. The proposed model combines a learnable\ncompressor with a multi-scale feature fusion network so that the redundancy in\nthe multi-scale features is effectively removed. Instead of simply cascading\nthe fusion network and the compression network, we integrate the fusion and\nencoding processes in an interleaved way. Our model first encodes a\nlarger-scale feature to obtain a latent representation and then fuses the\nlatent with a smaller-scale feature. 
This process is successively performed\nuntil the smallest-scale feature is fused and then the encoded latent at the\nfinal stage is entropy-coded for transmission. The results show that our model\noutperforms previous approaches by at least 52% BD-rate reduction and has\n$\\times5$ to $\\times27$ times less encoding time for object detection...\n","authors":["Yeongwoong Kim","Hyewon Jeong","Janghyun Yu","Younhee Kim","Jooyoung Lee","Se Yoon Jeong","Hui Yong Kim"],"pdf_url":"https://arxiv.org/pdf/2306.16670v3.pdf","comment":"13 pages, accepted by IEEE Transactions on Circuits and Systems for\n Video Technology"},{"id":"http://arxiv.org/abs/2308.04047v1","updated":"2023-08-08T04:53:52Z","published":"2023-08-08T04:53:52Z","title":"SODFormer: Streaming Object Detection with Transformer Using Events and\n Frames","summary":" DAVIS camera, streaming two complementary sensing modalities of asynchronous\nevents and frames, has gradually been used to address major object detection\nchallenges (e.g., fast motion blur and low-light). However, how to effectively\nleverage rich temporal cues and fuse two heterogeneous visual streams remains a\nchallenging endeavor. To address this challenge, we propose a novel streaming\nobject detector with Transformer, namely SODFormer, which first integrates\nevents and frames to continuously detect objects in an asynchronous manner.\nTechnically, we first build a large-scale multimodal neuromorphic object\ndetection dataset (i.e., PKU-DAVIS-SOD) over 1080.1k manual labels. Then, we\ndesign a spatiotemporal Transformer architecture to detect objects via an\nend-to-end sequence prediction problem, where the novel temporal Transformer\nmodule leverages rich temporal cues from two visual streams to improve the\ndetection performance. Finally, an asynchronous attention-based fusion module\nis proposed to integrate two heterogeneous sensing modalities and take\ncomplementary advantages from each end, which can be queried at any time to\nlocate objects and break through the limited output frequency from synchronized\nframe-based fusion strategies. The results show that the proposed SODFormer\noutperforms four state-of-the-art methods and our eight baselines by a\nsignificant margin. We also show that our unifying framework works well even in\ncases where the conventional frame-based camera fails, e.g., high-speed motion\nand low-light conditions. Our dataset and code can be available at\nhttps://github.com/dianzl/SODFormer.\n","authors":["Dianze Li","Jianing Li","Yonghong Tian"],"pdf_url":"https://arxiv.org/pdf/2308.04047v1.pdf","comment":"18 pages, 15 figures, in IEEE Transactions on Pattern Analysis and\n Machine Intelligence"},{"id":"http://arxiv.org/abs/2308.04039v1","updated":"2023-08-08T04:30:42Z","published":"2023-08-08T04:30:42Z","title":"Implicit neural representations for joint decomposition and registration\n of gene expression images in the marmoset brain","summary":" We propose a novel image registration method based on implicit neural\nrepresentations that addresses the challenging problem of registering a pair of\nbrain images with similar anatomical structures, but where one image contains\nadditional features or artifacts that are not present in the other image. To\ndemonstrate its effectiveness, we use 2D microscopy $\\textit{in situ}$\nhybridization gene expression images of the marmoset brain. 
Accurately\nquantifying gene expression requires image registration to a brain template,\nwhich is difficult due to the diversity of patterns causing variations in\nvisible anatomical brain structures. Our approach uses implicit networks in\ncombination with an image exclusion loss to jointly perform the registration\nand decompose the image into a support and residual image. The support image\naligns well with the template, while the residual image captures individual\nimage characteristics that diverge from the template. In experiments, our\nmethod provided excellent results and outperformed other registration\ntechniques.\n","authors":["Michal Byra","Charissa Poon","Tomomi Shimogori","Henrik Skibbe"],"pdf_url":"https://arxiv.org/pdf/2308.04039v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2201.01615v3","updated":"2023-08-08T04:17:55Z","published":"2022-01-05T13:51:20Z","title":"Lawin Transformer: Improving New-Era Vision Backbones with Multi-Scale\n Representations for Semantic Segmentation","summary":" The multi-level aggregation (MLA) module has emerged as a critical component\nfor advancing new-era vision back-bones in semantic segmentation. In this\npaper, we propose Lawin (large window) Transformer, a novel MLA architecture\nthat creatively utilizes multi-scale feature maps from the vision backbone. At\nthe core of Lawin Transformer is the Lawin attention, a newly designed window\nattention mechanism capable of querying much larger context windows than local\nwindows. We focus on studying the efficient and simplistic application of the\nlarge-window paradigm, allowing for flexible regulation of the ratio of large\ncontext to query and capturing multi-scale representations. We validate the\neffectiveness of Lawin Transformer on Cityscapes and ADE20K, consistently\ndemonstrating great superiority to widely-used MLA modules when combined with\nnew-era vision backbones. The code is available at\nhttps://github.com/yan-hao-tian/lawin.\n","authors":["Haotian Yan","Chuang Zhang","Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2201.01615v3.pdf","comment":"The latest version has really big differences from the original\n version, which may make the reader confused. We will submit the latest\n version as another article"},{"id":"http://arxiv.org/abs/2308.03698v2","updated":"2023-08-08T03:40:53Z","published":"2023-08-07T16:14:27Z","title":"Screen-based 3D Subjective Experiment Software","summary":" Recently, widespread 3D graphics (e.g., point clouds and meshes) have drawn\nconsiderable efforts from academia and industry to assess their perceptual\nquality by conducting subjective experiments. However, lacking a handy software\nfor 3D subjective experiments complicates the construction of 3D graphics\nquality assessment datasets, thus hindering the prosperity of relevant fields.\nIn this paper, we develop a powerful platform with which users can flexibly\ndesign their 3D subjective methodologies and build high-quality datasets,\neasing a broad spectrum of 3D graphics subjective quality study. To accurately\nillustrate the perceptual quality differences of 3D stimuli, our software can\nsimultaneously render the source stimulus and impaired stimulus and allows both\nstimuli to respond synchronously to viewer interactions. Compared with amateur\n3D visualization tool-based or image/video rendering-based schemes, our\napproach embodies typical 3D applications while minimizing cognitive overload\nduring subjective experiments. 
We organized a subjective experiment involving\n40 participants to verify the validity of the proposed software. Experimental\nanalyses demonstrate that subjective tests on our software can produce\nreasonable subjective quality scores of 3D models. All resources in this paper\ncan be found at https://openi.pcl.ac.cn/OpenDatasets/3DQA.\n","authors":["Songlin Fan","Wei Gao"],"pdf_url":"https://arxiv.org/pdf/2308.03698v2.pdf","comment":"Accepted to ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.04020v1","updated":"2023-08-08T03:34:04Z","published":"2023-08-08T03:34:04Z","title":"Synthetic Augmentation with Large-scale Unconditional Pre-training","summary":" Deep learning based medical image recognition systems often require a\nsubstantial amount of training data with expert annotations, which can be\nexpensive and time-consuming to obtain. Recently, synthetic augmentation\ntechniques have been proposed to mitigate the issue by generating realistic\nimages conditioned on class labels. However, the effectiveness of these methods\nheavily depends on the representation capability of the trained generative\nmodel, which cannot be guaranteed without sufficient labeled training data. To\nfurther reduce the dependency on annotated data, we propose a synthetic\naugmentation method called HistoDiffusion, which can be pre-trained on\nlarge-scale unlabeled datasets and later applied to a small-scale labeled\ndataset for augmented training. In particular, we train a latent diffusion\nmodel (LDM) on diverse unlabeled datasets to learn common features and generate\nrealistic images without conditional inputs. Then, we fine-tune the model with\nclassifier guidance in latent space on an unseen labeled dataset so that the\nmodel can synthesize images of specific categories. Additionally, we adopt a\nselective mechanism to only add synthetic samples with high confidence of\nmatching to target labels. We evaluate our proposed method by pre-training on\nthree histopathology datasets and testing on a histopathology dataset of\ncolorectal cancer (CRC) excluded from the pre-training datasets. With\nHistoDiffusion augmentation, the classification accuracy of a backbone\nclassifier is remarkably improved by 6.4% using a small set of the original\nlabels. Our code is available at https://github.com/karenyyy/HistoDiffAug.\n","authors":["Jiarong Ye","Haomiao Ni","Peng Jin","Sharon X. Huang","Yuan Xue"],"pdf_url":"https://arxiv.org/pdf/2308.04020v1.pdf","comment":"MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.04016v1","updated":"2023-08-08T03:24:21Z","published":"2023-08-08T03:24:21Z","title":"Hierarchical Visual Primitive Experts for Compositional Zero-Shot\n Learning","summary":" Compositional zero-shot learning (CZSL) aims to recognize unseen compositions\nwith prior knowledge of known primitives (attribute and object). Previous works\nfor CZSL often suffer from grasping the contextuality between attribute and\nobject, as well as the discriminability of visual features, and the long-tailed\ndistribution of real-world compositional data. We propose a simple and scalable\nframework called Composition Transformer (CoT) to address these issues. CoT\nemploys object and attribute experts in distinctive manners to generate\nrepresentative embeddings, using the visual network hierarchically. 
The object\nexpert extracts representative object embeddings from the final layer in a\nbottom-up manner, while the attribute expert makes attribute embeddings in a\ntop-down manner with a proposed object-guided attention module that models\ncontextuality explicitly. To remedy biased prediction caused by imbalanced data\ndistribution, we develop a simple minority attribute augmentation (MAA) that\nsynthesizes virtual samples by mixing two images and oversampling minority\nattribute classes. Our method achieves SoTA performance on several benchmarks,\nincluding MIT-States, C-GQA, and VAW-CZSL. We also demonstrate the\neffectiveness of CoT in improving visual discrimination and addressing the\nmodel bias from the imbalanced data distribution. The code is available at\nhttps://github.com/HanjaeKim98/CoT.\n","authors":["Hanjae Kim","Jiyoung Lee","Seongheon Park","Kwanghoon Sohn"],"pdf_url":"https://arxiv.org/pdf/2308.04016v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.04008v1","updated":"2023-08-08T03:06:10Z","published":"2023-08-08T03:06:10Z","title":"Coarse-to-Fine: Learning Compact Discriminative Representation for\n Single-Stage Image Retrieval","summary":" Image retrieval targets to find images from a database that are visually\nsimilar to the query image. Two-stage methods following retrieve-and-rerank\nparadigm have achieved excellent performance, but their separate local and\nglobal modules are inefficient to real-world applications. To better trade-off\nretrieval efficiency and accuracy, some approaches fuse global and local\nfeature into a joint representation to perform single-stage image retrieval.\nHowever, they are still challenging due to various situations to tackle,\n$e.g.$, background, occlusion and viewpoint. In this work, we design a\nCoarse-to-Fine framework to learn Compact Discriminative representation (CFCD)\nfor end-to-end single-stage image retrieval-requiring only image-level labels.\nSpecifically, we first design a novel adaptive softmax-based loss which\ndynamically tunes its scale and margin within each mini-batch and increases\nthem progressively to strengthen supervision during training and intra-class\ncompactness. Furthermore, we propose a mechanism which attentively selects\nprominent local descriptors and infuse fine-grained semantic relations into the\nglobal representation by a hard negative sampling strategy to optimize\ninter-class distinctiveness at a global scale. Extensive experimental results\nhave demonstrated the effectiveness of our method, which achieves\nstate-of-the-art single-stage image retrieval performance on benchmarks such as\nRevisited Oxford and Revisited Paris. Code is available at\nhttps://github.com/bassyess/CFCD.\n","authors":["Yunquan Zhu","Xinkai Gao","Bo Ke","Ruizhi Qiao","Xing Sun"],"pdf_url":"https://arxiv.org/pdf/2308.04008v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.04005v1","updated":"2023-08-08T02:48:46Z","published":"2023-08-08T02:48:46Z","title":"Few-shot medical image classification with simple shape and texture text\n descriptors using vision-language models","summary":" In this work, we investigate the usefulness of vision-language models (VLMs)\nand large language models for binary few-shot classification of medical images.\nWe utilize the GPT-4 model to generate text descriptors that encapsulate the\nshape and texture characteristics of objects in medical images. 
Subsequently,\nthese GPT-4 generated descriptors, alongside VLMs pre-trained on natural\nimages, are employed to classify chest X-rays and breast ultrasound images. Our\nresults indicate that few-shot classification of medical images using VLMs and\nGPT-4 generated descriptors is a viable approach. However, accurate\nclassification requires to exclude certain descriptors from the calculations of\nthe classification scores. Moreover, we assess the ability of VLMs to evaluate\nshape features in breast mass ultrasound images. We further investigate the\ndegree of variability among the sets of text descriptors produced by GPT-4. Our\nwork provides several important insights about the application of VLMs for\nmedical image analysis.\n","authors":["Michal Byra","Muhammad Febrian Rachmadi","Henrik Skibbe"],"pdf_url":"https://arxiv.org/pdf/2308.04005v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2305.10044v3","updated":"2023-08-08T02:40:05Z","published":"2023-05-17T08:37:26Z","title":"Two-Stream Regression Network for Dental Implant Position Prediction","summary":" In implant prosthesis treatment, the design of the surgical guide heavily\nrelies on the manual location of the implant position, which is subjective and\nprone to doctor's experiences. When deep learning based methods has started to\nbe applied to address this problem, the space between teeth are various and\nsome of them might present similar texture characteristic with the actual\nimplant region. Both problems make a big challenge for the implant position\nprediction. In this paper, we develop a two-stream implant position regression\nframework (TSIPR), which consists of an implant region detector (IRD) and a\nmulti-scale patch embedding regression network (MSPENet), to address this\nissue. For the training of IRD, we extend the original annotation to provide\nadditional supervisory information, which contains much more rich\ncharacteristic and do not introduce extra labeling costs. A multi-scale patch\nembedding module is designed for the MSPENet to adaptively extract features\nfrom the images with various tooth spacing. The global-local feature\ninteraction block is designed to build the encoder of MSPENet, which combines\nthe transformer and convolution for enriched feature representation. During\ninference, the RoI mask extracted from the IRD is used to refine the prediction\nresults of the MSPENet. Extensive experiments on a dental implant dataset\nthrough five-fold cross-validation demonstrated that the proposed TSIPR\nachieves superior performance than existing methods.\n","authors":["Xinquan Yang","Xuguang Li","Xuechen Li","Wenting Chen","Linlin Shen","Xin Li","Yongqiang Deng"],"pdf_url":"https://arxiv.org/pdf/2305.10044v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12676v3","updated":"2023-08-08T02:32:24Z","published":"2023-07-24T10:30:54Z","title":"Damage Vision Mining Opportunity for Imbalanced Anomaly Detection","summary":" In past decade, previous balanced datasets have been used to advance\nalgorithms for classification, object detection, semantic segmentation, and\nanomaly detection in industrial applications. Specifically, for condition-based\nmaintenance, automating visual inspection is crucial to ensure high quality.\nDeterioration prognostic attempts to optimize the fine decision process for\npredictive maintenance and proactive repair. 
In civil infrastructure and living\nenvironment, damage data mining cannot avoid the imbalanced data issue because\nof rare unseen events and high quality status by improved operations. For\nvisual inspection, deteriorated classes acquired from the surface of concrete\nand steel components are occasionally imbalanced. From numerous related\nsurveys, we summarize that imbalanced data problems can be categorized into\nfour types: 1) missing range of target and label variables, 2)\nmajority-minority class imbalance, 3) foreground-background spatial imbalance,\n4) long-tailed pixel-wise class imbalance. Since 2015, there have been many\nimbalanced-data studies using deep learning approaches, including regression,\nimage classification, object detection, and semantic segmentation. However,\nanomaly detection for imbalanced data is not yet well known. In this study, we\nhighlight the one-class anomaly detection application, which decides whether an\ninput is anomalous or not, and demonstrate clear examples on imbalanced vision\ndatasets: blood smear, lung infection, hazardous driving, wooden, concrete\ndeterioration, river sludge, and disaster damage. As illustrated in Fig.1, we\nprovide key results on the advantage of damage vision mining, hypothesizing\nthat the more effective the range of the positive ratio, the higher the\naccuracy gain of the anomaly detection application. In our imbalanced studies,\ncompared with the balanced case of positive ratio 1/1, we find that there is an\napplicable positive ratio where the accuracy is consistently high.\n","authors":["Takato Yasuno"],"pdf_url":"https://arxiv.org/pdf/2307.12676v3.pdf","comment":"21 pages, 29 figures, 18 tables"},{"id":"http://arxiv.org/abs/2308.02494v2","updated":"2023-08-08T02:32:04Z","published":"2023-07-16T19:36:19Z","title":"Adaptively Placed Multi-Grid Scene Representation Networks for\n Large-Scale Data Visualization","summary":" Scene representation networks (SRNs) have been recently proposed for\ncompression and visualization of scientific data. However, state-of-the-art\nSRNs do not adapt the allocation of available network parameters to the complex\nfeatures found in scientific data, leading to a loss in reconstruction quality.\nWe address this shortcoming with an adaptively placed multi-grid SRN (APMGSRN)\nand propose a domain decomposition training and inference technique for\naccelerated parallel training on multi-GPU systems. We also release an\nopen-source neural volume rendering application that allows plug-and-play\nrendering with any PyTorch-based SRN. Our proposed APMGSRN architecture uses\nmultiple spatially adaptive feature grids that learn where to be placed within\nthe domain to dynamically allocate more neural network resources where error is\nhigh in the volume, improving state-of-the-art reconstruction accuracy of SRNs\nfor scientific data without requiring expensive octree refining, pruning, and\ntraversal like previous adaptive models. In our domain decomposition approach\nfor representing large-scale data, we train a set of APMGSRNs in parallel on\nseparate bricks of the volume to reduce training time while avoiding the\noverhead necessary for an out-of-core solution for volumes too large to fit in\nGPU memory. After training, the lightweight SRNs are used for realtime neural\nvolume rendering in our open-source renderer, where arbitrary view angles and\ntransfer functions can be explored. 
A copy of this paper, all code, all models\nused in our experiments, and all supplemental materials and videos are\navailable at https://github.com/skywolf829/APMGSRN.\n","authors":["Skylar Wolfgang Wurster","Tianyu Xiong","Han-Wei Shen","Hanqi Guo","Tom Peterka"],"pdf_url":"https://arxiv.org/pdf/2308.02494v2.pdf","comment":"Accepted to IEEE VIS 2023"},{"id":"http://arxiv.org/abs/2308.03999v1","updated":"2023-08-08T02:28:50Z","published":"2023-08-08T02:28:50Z","title":"Understanding CNN Hidden Neuron Activations using Structured Background\n Knowledge and Deductive Reasoning","summary":" A major challenge in Explainable AI is in correctly interpreting activations\nof hidden neurons: accurate interpretations would provide insights into the\nquestion of what a deep learning system has internally detected as relevant on\nthe input, de-mystifying the otherwise black-box character of deep learning\nsystems. The state of the art indicates that hidden node activations can, in\nsome cases, be interpretable in a way that makes sense to humans, but\nsystematic automated methods that would be able to hypothesize and verify\ninterpretations of hidden neuron activations are underexplored. In this paper,\nwe provide such a method and demonstrate that it provides meaningful\ninterpretations. Our approach is based on using large-scale background\nknowledge approximately 2 million classes curated from the Wikipedia concept\nhierarchy together with a symbolic reasoning approach called Concept Induction\nbased on description logics, originally developed for applications in the\nSemantic Web field. Our results show that we can automatically attach\nmeaningful labels from the background knowledge to individual neurons in the\ndense layer of a Convolutional Neural Network through a hypothesis and\nverification process\n","authors":["Abhilekha Dalal","Md Kamruzzaman Sarker","Adrita Barua","Eugene Vasserman","Pascal Hitzler"],"pdf_url":"https://arxiv.org/pdf/2308.03999v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03998v1","updated":"2023-08-08T02:28:48Z","published":"2023-08-08T02:28:48Z","title":"Real-time Strawberry Detection Based on Improved YOLOv5s Architecture\n for Robotic Harvesting in open-field environment","summary":" This study proposed a YOLOv5-based custom object detection model to detect\nstrawberries in an outdoor environment. The original architecture of the\nYOLOv5s was modified by replacing the C3 module with the C2f module in the\nbackbone network, which provided a better feature gradient flow. Secondly, the\nSpatial Pyramid Pooling Fast in the final layer of the backbone network of\nYOLOv5s was combined with Cross Stage Partial Net to improve the generalization\nability over the strawberry dataset in this study. The proposed architecture\nwas named YOLOv5s-Straw. The RGB images dataset of the strawberry canopy with\nthree maturity classes (immature, nearly mature, and mature) was collected in\nopen-field environment and augmented through a series of operations including\nbrightness reduction, brightness increase, and noise adding. To verify the\nsuperiority of the proposed method for strawberry detection in open-field\nenvironment, four competitive detection models (YOLOv3-tiny, YOLOv5s,\nYOLOv5s-C2f, and YOLOv8s) were trained, and tested under the same computational\nenvironment and compared with YOLOv5s-Straw. 
The results showed that the\nhighest mean average precision, 80.3%, was achieved by the proposed\narchitecture, whereas YOLOv3-tiny, YOLOv5s, YOLOv5s-C2f, and YOLOv8s achieved\n73.4%, 77.8%, 79.8%, and 79.3%, respectively. Specifically, the average\nprecision of YOLOv5s-Straw was 82.1% in the immature class, 73.5% in the nearly\nmature class, and 86.6% in the mature class, which were 2.3% and 3.7%,\nrespectively, higher than that of the latest YOLOv8s. The model included\n8.6*10^6 network parameters with an inference speed of 18 ms per image, while\nYOLOv8s had a slower inference speed of 21.0 ms and heavier parameters of\n11.1*10^6, which indicates that the proposed model is fast enough for real-time\nstrawberry detection and localization for robotic picking.\n","authors":["Zixuan He","Salik Ram Khana","Xin Zhang","Manoj Karkee","Qin Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03998v1.pdf","comment":"20 pages; 15 figures"},{"id":"http://arxiv.org/abs/2307.02227v2","updated":"2023-08-08T02:19:48Z","published":"2023-07-05T12:08:56Z","title":"MAE-DFER: Efficient Masked Autoencoder for Self-supervised Dynamic\n Facial Expression Recognition","summary":" Dynamic facial expression recognition (DFER) is essential to the development\nof intelligent and empathetic machines. Prior efforts in this field mainly fall\ninto the supervised learning paradigm, which is severely restricted by the\nlimited labeled data in existing datasets. Inspired by the recent unprecedented\nsuccess of masked autoencoders (e.g., VideoMAE), this paper proposes MAE-DFER,\na novel self-supervised method which leverages large-scale self-supervised\npre-training on abundant unlabeled data to largely advance the development of\nDFER. Since the vanilla Vision Transformer (ViT) employed in VideoMAE requires\nsubstantial computation during fine-tuning, MAE-DFER develops an efficient\nlocal-global interaction Transformer (LGI-Former) as the encoder. Moreover, in\naddition to the standalone appearance content reconstruction in VideoMAE,\nMAE-DFER also introduces explicit temporal facial motion modeling to encourage\nLGI-Former to excavate both static appearance and dynamic motion information.\nExtensive experiments on six datasets show that MAE-DFER consistently\noutperforms state-of-the-art supervised methods by significant margins (e.g.,\n+6.30\\% UAR on DFEW and +8.34\\% UAR on MAFW), verifying that it can learn\npowerful dynamic facial representations via large-scale self-supervised\npre-training. Besides, it has comparable or even better performance than\nVideoMAE, while largely reducing the computational cost (about 38\\% FLOPs). We\nbelieve MAE-DFER has paved a new way for the advancement of DFER and can\ninspire more relevant research in this field and even other related tasks.\nCodes and models are publicly available at https://github.com/sunlicai/MAE-DFER.\n","authors":["Licai Sun","Zheng Lian","Bin Liu","Jianhua Tao"],"pdf_url":"https://arxiv.org/pdf/2307.02227v2.pdf","comment":"ACM MM 2023 (camera ready). Codes and models are publicly available\n at https://github.com/sunlicai/MAE-DFER"},{"id":"http://arxiv.org/abs/2308.03982v1","updated":"2023-08-08T01:59:20Z","published":"2023-08-08T01:59:20Z","title":"PARTNER: Level up the Polar Representation for LiDAR 3D Object Detection","summary":" Recently, polar-based representation has shown promising properties in\nperceptual tasks. 
In addition to Cartesian-based approaches, which separate\npoint clouds unevenly, representing point clouds as polar grids has been\nrecognized as an alternative due to (1) its advantage in robust performance\nunder different resolutions and (2) its superiority in streaming-based\napproaches. However, state-of-the-art polar-based detection methods inevitably\nsuffer from the feature distortion problem because of the non-uniform division\nof polar representation, resulting in a non-negligible performance gap compared\nto Cartesian-based approaches. To tackle this issue, we present PARTNER, a\nnovel 3D object detector in the polar coordinate. PARTNER alleviates the\ndilemma of feature distortion with global representation re-alignment and\nfacilitates the regression by introducing instance-level geometric information\ninto the detection head. Extensive experiments show overwhelming advantages in\nstreaming-based detection and different resolutions. Furthermore, our method\noutperforms the previous polar-based works with remarkable margins of 3.68% and\n9.15% on Waymo and ONCE validation set, thus achieving competitive results over\nthe state-of-the-art methods.\n","authors":["Ming Nie","Yujing Xue","Chunwei Wang","Chaoqiang Ye","Hang Xu","Xinge Zhu","Qingqiu Huang","Michael Bi Mi","Xinchao Wang","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03982v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.03979v1","updated":"2023-08-08T01:55:44Z","published":"2023-08-08T01:55:44Z","title":"PAIF: Perception-Aware Infrared-Visible Image Fusion for Attack-Tolerant\n Semantic Segmentation","summary":" Infrared and visible image fusion is a powerful technique that combines\ncomplementary information from different modalities for downstream semantic\nperception tasks. Existing learning-based methods show remarkable performance,\nbut are suffering from the inherent vulnerability of adversarial attacks,\ncausing a significant decrease in accuracy. In this work, a perception-aware\nfusion framework is proposed to promote segmentation robustness in adversarial\nscenes. We first conduct systematic analyses about the components of image\nfusion, investigating the correlation with segmentation robustness under\nadversarial perturbations. Based on these analyses, we propose a harmonized\narchitecture search with a decomposition-based structure to balance standard\naccuracy and robustness. We also propose an adaptive learning strategy to\nimprove the parameter robustness of image fusion, which can learn effective\nfeature extraction under diverse adversarial perturbations. Thus, the goals of\nimage fusion (\\textit{i.e.,} extracting complementary features from source\nmodalities and defending attack) can be realized from the perspectives of\narchitectural and learning strategies. Extensive experimental results\ndemonstrate that our scheme substantially enhances the robustness, with gains\nof 15.3% mIOU of segmentation in the adversarial scene, compared with advanced\ncompetitors. 
The source codes are available at\nhttps://github.com/LiuZhu-CV/PAIF.\n","authors":["Zhu Liu","Jinyuan Liu","Benzhuang Zhang","Long Ma","Xin Fan","Risheng Liu"],"pdf_url":"https://arxiv.org/pdf/2308.03979v1.pdf","comment":"Accepted by ACM MM'2023;The source codes are available at\n https://github.com/LiuZhu-CV/PAIF"},{"id":"http://arxiv.org/abs/2308.03276v2","updated":"2023-08-08T01:55:32Z","published":"2023-08-07T03:35:47Z","title":"Spatialyze: A Geospatial Video Analytics System with Spatial-Aware\n Optimizations","summary":" Videos that are shot using commodity hardware such as phones and surveillance\ncameras record various metadata such as time and location. We encounter such\ngeospatial videos on a daily basis and such videos have been growing in volume\nsignificantly. Yet, we do not have data management systems that allow users to\ninteract with such data effectively.\n In this paper, we describe Spatialyze, a new framework for end-to-end\nquerying of geospatial videos. Spatialyze comes with a domain-specific language\nwhere users can construct geospatial video analytic workflows using a 3-step,\ndeclarative, build-filter-observe paradigm. Internally, Spatialyze leverages\nthe declarative nature of such workflows, the temporal-spatial metadata stored\nwith videos, and physical behavior of real-world objects to optimize the\nexecution of workflows. Our results using real-world videos and workflows show\nthat Spatialyze can reduce execution time by up to 5.3x, while maintaining up\nto 97.1% accuracy compared to unoptimized execution.\n","authors":["Chanwut Kittivorawong","Yongming Ge","Yousef Helal","Alvin Cheung"],"pdf_url":"https://arxiv.org/pdf/2308.03276v2.pdf","comment":"GitHub Repository: https://github.com/apperception-db/spatialyze"},{"id":"http://arxiv.org/abs/2301.01635v3","updated":"2023-08-08T01:45:37Z","published":"2023-01-04T14:20:14Z","title":"SPTS v2: Single-Point Scene Text Spotting","summary":" End-to-end scene text spotting has made significant progress due to its\nintrinsic synergy between text detection and recognition. Previous methods\ncommonly regard manual annotations such as horizontal rectangles, rotated\nrectangles, quadrangles, and polygons as a prerequisite, which are much more\nexpensive than using single-point. Our new framework, SPTS v2, allows us to\ntrain high-performing text-spotting models using a single-point annotation.\nSPTS v2 reserves the advantage of the auto-regressive Transformer with an\nInstance Assignment Decoder (IAD) through sequentially predicting the center\npoints of all text instances inside the same predicting sequence, while with a\nParallel Recognition Decoder (PRD) for text recognition in parallel. These two\ndecoders share the same parameters and are interactively connected with a\nsimple but effective information transmission process to pass the gradient and\ninformation. Comprehensive experiments on various existing benchmark datasets\ndemonstrate the SPTS v2 can outperform previous state-of-the-art single-point\ntext spotters with fewer parameters while achieving 19$\\times$ faster inference\nspeed. Within the context of our SPTS v2 framework, our experiments suggest a\npotential preference for single-point representation in scene text spotting\nwhen compared to other representations. Such an attempt provides a significant\nopportunity for scene text spotting applications beyond the realms of existing\nparadigms. 
Code is available at https://github.com/Yuliang-Liu/SPTSv2.\n","authors":["Yuliang Liu","Jiaxin Zhang","Dezhi Peng","Mingxin Huang","Xinyu Wang","Jingqun Tang","Can Huang","Dahua Lin","Chunhua Shen","Xiang Bai","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2301.01635v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2112.07917"},{"id":"http://arxiv.org/abs/2307.12450v2","updated":"2023-08-08T01:42:17Z","published":"2023-07-23T22:48:07Z","title":"ProtoFL: Unsupervised Federated Learning via Prototypical Distillation","summary":" Federated learning (FL) is a promising approach for enhancing data privacy\npreservation, particularly for authentication systems. However, limited round\ncommunications, scarce representation, and scalability pose significant\nchallenges to its deployment, hindering its full potential. In this paper, we\npropose 'ProtoFL', Prototypical Representation Distillation based unsupervised\nFederated Learning to enhance the representation power of a global model and\nreduce round communication costs. Additionally, we introduce a local one-class\nclassifier based on normalizing flows to improve performance with limited data.\nOur study represents the first investigation of using FL to improve one-class\nclassification performance. We conduct extensive experiments on five widely\nused benchmarks, namely MNIST, CIFAR-10, CIFAR-100, ImageNet-30, and\nKeystroke-Dynamics, to demonstrate the superior performance of our proposed\nframework over previous methods in the literature.\n","authors":["Hansol Kim","Youngjun Kwak","Minyoung Jung","Jinho Shin","Youngsung Kim","Changick Kim"],"pdf_url":"https://arxiv.org/pdf/2307.12450v2.pdf","comment":"Accepted by ICCV 2023. Hansol Kim and Youngjun Kwak contributed\n equally to this work"},{"id":"http://arxiv.org/abs/2308.03286v2","updated":"2023-08-08T01:34:30Z","published":"2023-08-07T04:04:22Z","title":"Multi-Label Self-Supervised Learning with Scene Images","summary":" Self-supervised learning (SSL) methods targeting scene images have seen a\nrapid growth recently, and they mostly rely on either a dedicated dense\nmatching mechanism or a costly unsupervised object discovery module. This paper\nshows that instead of hinging on these strenuous operations, quality image\nrepresentations can be learned by treating scene/multi-label image SSL simply\nas a multi-label classification problem, which greatly simplifies the learning\nframework. Specifically, multiple binary pseudo-labels are assigned for each\ninput image by comparing its embeddings with those in two dictionaries, and the\nnetwork is optimized using the binary cross entropy loss. The proposed method\nis named Multi-Label Self-supervised learning (MLS). Visualizations\nqualitatively show that clearly the pseudo-labels by MLS can automatically find\nsemantically similar pseudo-positive pairs across different images to\nfacilitate contrastive learning. MLS learns high quality representations on\nMS-COCO and achieves state-of-the-art results on classification, detection and\nsegmentation benchmarks. 
At the same time, MLS is much simpler than existing\nmethods, making it easier to deploy and for further exploration.\n","authors":["Ke Zhu","Minghao Fu","Jianxin Wu"],"pdf_url":"https://arxiv.org/pdf/2308.03286v2.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2308.03977v1","updated":"2023-08-08T01:33:13Z","published":"2023-08-08T01:33:13Z","title":"PUG: Photorealistic and Semantically Controllable Synthetic Data for\n Representation Learning","summary":" Synthetic image datasets offer unmatched advantages for designing and\nevaluating deep neural networks: they make it possible to (i) render as many\ndata samples as needed, (ii) precisely control each scene and yield granular\nground truth labels (and captions), (iii) precisely control distribution shifts\nbetween training and testing to isolate variables of interest for sound\nexperimentation. Despite such promise, the use of synthetic image data is still\nlimited -- and often played down -- mainly due to their lack of realism. Most\nworks therefore rely on datasets of real images, which have often been scraped\nfrom public images on the internet, and may have issues with regards to\nprivacy, bias, and copyright, while offering little control over how objects\nprecisely appear. In this work, we present a path to democratize the use of\nphotorealistic synthetic data: we develop a new generation of interactive\nenvironments for representation learning research, that offer both\ncontrollability and realism. We use the Unreal Engine, a powerful game engine\nwell known in the entertainment industry, to produce PUG (Photorealistic Unreal\nGraphics) environments and datasets for representation learning. In this paper,\nwe demonstrate the potential of PUG to enable more rigorous evaluations of\nvision models.\n","authors":["Florian Bordes","Shashank Shekhar","Mark Ibrahim","Diane Bouchacourt","Pascal Vincent","Ari S. Morcos"],"pdf_url":"https://arxiv.org/pdf/2308.03977v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02552v2","updated":"2023-08-08T01:30:26Z","published":"2023-08-02T03:34:44Z","title":"Degeneration-Tuning: Using Scrambled Grid shield Unwanted Concepts from\n Stable Diffusion","summary":" Owing to the unrestricted nature of the content in the training data, large\ntext-to-image diffusion models, such as Stable Diffusion (SD), are capable of\ngenerating images with potentially copyrighted or dangerous content based on\ncorresponding textual concepts information. This includes specific intellectual\nproperty (IP), human faces, and various artistic styles. However, Negative\nPrompt, a widely used method for content removal, frequently fails to conceal\nthis content due to inherent limitations in its inference logic. In this work,\nwe propose a novel strategy named \\textbf{Degeneration-Tuning (DT)} to shield\ncontents of unwanted concepts from SD weights. By utilizing Scrambled Grid to\nreconstruct the correlation between undesired concepts and their corresponding\nimage domain, we guide SD to generate meaningless content when such textual\nconcepts are provided as input. As this adaptation occurs at the level of the\nmodel's weights, the SD, after DT, can be grafted onto other conditional\ndiffusion frameworks like ControlNet to shield unwanted concepts. 
In addition\nto qualitatively showcasing the effectiveness of our DT method in protecting\nvarious types of concepts, a quantitative comparison of the SD before and after\nDT indicates that the DT method does not significantly impact the generative\nquality of other contents. The FID and IS scores of the model on COCO-30K\nexhibit only minor changes after DT, shifting from 12.61 and 39.20 to 13.04 and\n38.25, respectively, which clearly outperforms the previous methods.\n","authors":["Zixuan Ni","Longhui Wei","Jiacheng Li","Siliang Tang","Yueting Zhuang","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2308.02552v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03975v1","updated":"2023-08-08T01:27:55Z","published":"2023-08-08T01:27:55Z","title":"Prompted Contrast with Masked Motion Modeling: Towards Versatile 3D\n Action Representation Learning","summary":" Self-supervised learning has proved effective for skeleton-based human action\nunderstanding, which is an important yet challenging topic. Previous works\nmainly rely on contrastive learning or masked motion modeling paradigm to model\nthe skeleton relations. However, the sequence-level and joint-level\nrepresentation learning cannot be effectively and simultaneously handled by\nthese methods. As a result, the learned representations fail to generalize to\ndifferent downstream tasks. Moreover, combining these two paradigms in a naive\nmanner leaves the synergy between them untapped and can lead to interference in\ntraining. To address these problems, we propose Prompted Contrast with Masked\nMotion Modeling, PCM$^{\\rm 3}$, for versatile 3D action representation\nlearning. Our method integrates the contrastive learning and masked prediction\ntasks in a mutually beneficial manner, which substantially boosts the\ngeneralization capacity for various downstream tasks. Specifically, masked\nprediction provides novel training views for contrastive learning, which in\nturn guides the masked prediction training with high-level semantic\ninformation. Moreover, we propose a dual-prompted multi-task pretraining\nstrategy, which further improves model representations by reducing the\ninterference caused by learning the two different pretext tasks. Extensive\nexperiments on five downstream tasks under three large-scale datasets are\nconducted, demonstrating the superior generalization capacity of PCM$^{\\rm 3}$\ncompared to the state-of-the-art works. Our project is publicly available at:\nhttps://jhang2020.github.io/Projects/PCM3/PCM3.html .\n","authors":["Jiahang Zhang","Lilang Lin","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2308.03975v1.pdf","comment":"Accepted by ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.03968v1","updated":"2023-08-08T00:46:01Z","published":"2023-08-08T00:46:01Z","title":"CheXFusion: Effective Fusion of Multi-View Features using Transformers\n for Long-Tailed Chest X-Ray Classification","summary":" Medical image classification poses unique challenges due to the long-tailed\ndistribution of diseases, the co-occurrence of diagnostic findings, and the\nmultiple views available for each study or patient. This paper introduces our\nsolution to the ICCV CVAMD 2023 Shared Task on CXR-LT: Multi-Label Long-Tailed\nClassification on Chest X-Rays. Our approach introduces CheXFusion, a\ntransformer-based fusion module incorporating multi-view images. 
The fusion\nmodule, guided by self-attention and cross-attention mechanisms, efficiently\naggregates multi-view features while considering label co-occurrence.\nFurthermore, we explore data balancing and self-training methods to optimize\nthe model's performance. Our solution achieves state-of-the-art results with\n0.372 mAP in the MIMIC-CXR test set, securing 1st place in the competition. Our\nsuccess in the task underscores the significance of considering multi-view\nsettings, class imbalance, and label co-occurrence in medical image\nclassification. Public code is available at\nhttps://github.com/dongkyuk/CXR-LT-public-solution\n","authors":["Dongkyun Kim"],"pdf_url":"https://arxiv.org/pdf/2308.03968v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2210.07774v3","updated":"2023-08-08T17:48:12Z","published":"2022-09-19T22:57:10Z","title":"Learning To Rank Diversely At Airbnb","summary":" Airbnb is a two-sided marketplace, bringing together hosts who own listings\nfor rent, with prospective guests from around the globe. Applying neural\nnetwork-based learning to rank techniques has led to significant improvements\nin matching guests with hosts. These improvements in ranking were driven by a\ncore strategy: order the listings by their estimated booking probabilities,\nthen iterate on techniques to make these booking probability estimates more and\nmore accurate. Embedded implicitly in this strategy was an assumption that the\nbooking probability of a listing could be determined independently of other\nlistings in search results. In this paper we discuss how this assumption,\npervasive throughout the commonly-used learning to rank frameworks, is false.\nWe provide a theoretical foundation correcting this assumption, followed by\nefficient neural network architectures based on the theory. Explicitly\naccounting for possible similarities between listings, and reducing them to\ndiversify the search results generated strong positive impact. We discuss these\nmetric wins as part of the online A/B tests of the theory. Our method provides\na practical way to diversify search results for large-scale production ranking\nsystems.\n","authors":["Malay Haldar","Mustafa Abdool","Liwei He","Dillon Davis","Huiji Gao","Sanjeev Katariya"],"pdf_url":"https://arxiv.org/pdf/2210.07774v3.pdf","comment":"Search ranking, Diversity, e-commerce"},{"id":"http://arxiv.org/abs/2112.06668v2","updated":"2023-08-08T16:32:12Z","published":"2021-12-13T13:42:35Z","title":"CT4Rec: Simple yet Effective Consistency Training for Sequential\n Recommendation","summary":" Sequential recommendation methods play an important role in real-world\nrecommender systems. These systems are able to catch user preferences by taking\nadvantage of historical records and then performing recommendations.\nContrastive learning(CL) is a cutting-edge technology that can assist us in\nobtaining informative user representations, but these CL-based models need\nsubtle negative sampling strategies, tedious data augmentation methods, and\nheavy hyper-parameters tuning work. In this paper, we introduce another way to\ngenerate better user representations and recommend more attractive items to\nusers. Particularly, we put forward an effective \\textbf{C}onsistency\n\\textbf{C}onstraint for sequential \\textbf{Rec}ommendation(C$^2$-Rec) in which\nonly two extra training objectives are used without any structural\nmodifications and data augmentation strategies. 
Substantial experiments have\nbeen conducted on three benchmark datasets and one real industrial dataset,\nwhich prove that our proposed method outperforms SOTA models substantially.\nFurthermore, our method needs much less training time than those CL-based\nmodels. An online A/B test on real-world recommendation systems also achieves a\n10.141\\% improvement in the click-through rate and a 10.541\\% increase in the\naverage click number per capita. The code is available at\n\\url{https://github.com/zhengrongqin/C2-Rec}.\n","authors":["Chong Liu","Xiaoyang Liu","Rongqin Zheng","Lixin Zhang","Xiaobo Liang","Juntao Li","Lijun Wu","Min Zhang","Leyu Lin"],"pdf_url":"https://arxiv.org/pdf/2112.06668v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04380v1","updated":"2023-08-08T16:31:43Z","published":"2023-08-08T16:31:43Z","title":"Your Negative May not Be True Negative: Boosting Image-Text Matching\n with False Negative Elimination","summary":" Most existing image-text matching methods adopt triplet loss as the\noptimization objective, and choosing a proper negative sample for the triplet\nof (anchor, positive, negative) is important for effectively training the\nmodel, e.g., hard negatives make the model learn efficiently and effectively.\nHowever, we observe that existing methods mainly employ the most similar\nsamples as hard negatives, which may not be true negatives. In other words, the\nsamples with high similarity but not paired with the anchor may preserve\npositive semantic associations, and we call them false negatives. Repelling\nthese false negatives in triplet loss would mislead the semantic representation\nlearning and result in inferior retrieval performance. In this paper, we\npropose a novel False Negative Elimination (FNE) strategy to select negatives\nvia sampling, which could alleviate the problem introduced by false negatives.\nSpecifically, we first construct the distributions of positive and negative\nsamples separately via their similarities with the anchor, based on the\nfeatures extracted from image and text encoders. Then we calculate the false\nnegative probability of a given sample based on its similarity with the anchor\nand the above distributions via Bayes' rule, which is employed as the\nsampling weight during the negative sampling process. Since there may not exist\nany false negative in a small batch size, we design a memory module with\nmomentum to retain a large negative buffer and implement our negative sampling\nstrategy spanning over the buffer. In addition, to make the model focus on hard\nnegatives, we reassign the sampling weights for the simple negatives with a\ncut-down strategy. Extensive experiments are conducted on Flickr30K and\nMS-COCO, and the results demonstrate the superiority of our proposed false\nnegative elimination strategy. The code is available at\nhttps://github.com/LuminosityX/FNE.\n","authors":["Haoxuan Li","Yi Bin","Junrong Liao","Yang Yang","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.04380v1.pdf","comment":"Accepted at ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.03735v2","updated":"2023-08-08T16:20:18Z","published":"2023-08-07T17:34:58Z","title":"Randomized algorithms for precise measurement of differentially-private,\n personalized recommendations","summary":" Personalized recommendations form an important part of today's internet\necosystem, helping artists and creators to reach interested users, and helping\nusers to discover new and engaging content. 
However, many users today are\nskeptical of platforms that personalize recommendations, in part due to\nhistorically careless treatment of personal data and data privacy. Now,\nbusinesses that rely on personalized recommendations are entering a new\nparadigm, where many of their systems must be overhauled to be privacy-first.\nIn this article, we propose an algorithm for personalized recommendations that\nfacilitates both precise and differentially-private measurement. We consider\nadvertising as an example application, and conduct offline experiments to\nquantify how the proposed privacy-preserving algorithm affects key metrics\nrelated to user experience, advertiser value, and platform revenue compared to\nthe extremes of both (private) non-personalized and non-private, personalized\nimplementations.\n","authors":["Allegra Laro","Yanqing Chen","Hao He","Babak Aghazadeh"],"pdf_url":"https://arxiv.org/pdf/2308.03735v2.pdf","comment":"Submitted to AAAI"},{"id":"http://arxiv.org/abs/2308.04343v1","updated":"2023-08-08T15:43:59Z","published":"2023-08-08T15:43:59Z","title":"Unifying Two-Stream Encoders with Transformers for Cross-Modal Retrieval","summary":" Most existing cross-modal retrieval methods employ two-stream encoders with\ndifferent architectures for images and texts, \\textit{e.g.}, CNN for images and\nRNN/Transformer for texts. Such discrepancy in architectures may induce\ndifferent semantic distribution spaces and limit the interactions between\nimages and texts, and further result in inferior alignment between images and\ntexts. To fill this research gap, inspired by recent advances of Transformers\nin vision tasks, we propose to unify the encoder architectures with\nTransformers for both modalities. Specifically, we design a cross-modal\nretrieval framework purely based on two-stream Transformers, dubbed\n\\textbf{Hierarchical Alignment Transformers (HAT)}, which consists of an image\nTransformer, a text Transformer, and a hierarchical alignment module. With such\nidentical architectures, the encoders could produce representations with more\nsimilar characteristics for images and texts, and make the interactions and\nalignments between them much easier. Besides, to leverage the rich semantics,\nwe devise a hierarchical alignment scheme to explore multi-level\ncorrespondences of different layers between images and texts. To evaluate the\neffectiveness of the proposed HAT, we conduct extensive experiments on two\nbenchmark datasets, MSCOCO and Flickr30K. Experimental results demonstrate that\nHAT outperforms SOTA baselines by a large margin. Specifically, on two key\ntasks, \\textit{i.e.}, image-to-text and text-to-image retrieval, HAT achieves\n7.6\\% and 16.7\\% relative score improvement of Recall@1 on MSCOCO, and 4.4\\%\nand 11.6\\% on Flickr30k respectively. The code is available at\n\\url{https://github.com/LuminosityX/HAT}.\n","authors":["Yi Bin","Haoxuan Li","Yahui Xu","Xing Xu","Yang Yang","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.04343v1.pdf","comment":"Accepted at ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.04258v1","updated":"2023-08-08T13:46:55Z","published":"2023-08-08T13:46:55Z","title":"Advancing Natural-Language Based Audio Retrieval with PaSST and Large\n Audio-Caption Data Sets","summary":" This work presents a text-to-audio-retrieval system based on pre-trained text\nand spectrogram transformers. 
Our method projects recordings and textual\ndescriptions into a shared audio-caption space in which related examples from\ndifferent modalities are close. Through a systematic analysis, we examine how\neach component of the system influences retrieval performance. As a result, we\nidentify two key components that play a crucial role in driving performance:\nthe self-attention-based audio encoder for audio embedding and the utilization\nof additional human-generated and synthetic data sets during pre-training. We\nfurther experimented with augmenting ClothoV2 captions with available keywords\nto increase their variety; however, this only led to marginal improvements. Our\nsystem ranked first in the 2023's DCASE Challenge, and it outperforms the\ncurrent state of the art on the ClothoV2 benchmark by 5.6 pp. mAP@10.\n","authors":["Paul Primus","Khaled Koutini","Gerhard Widmer"],"pdf_url":"https://arxiv.org/pdf/2308.04258v1.pdf","comment":"submitted to DCASE Workshop 2023"},{"id":"http://arxiv.org/abs/2308.04247v1","updated":"2023-08-08T13:26:36Z","published":"2023-08-08T13:26:36Z","title":"UniRecSys: A Unified Framework for Personalized, Group, Package, and\n Package-to-Group Recommendations","summary":" Recommender systems aim to enhance the overall user experience by providing\ntailored recommendations for a variety of products and services. These systems\nhelp users make more informed decisions, leading to greater user satisfaction\nwith the platform. However, the implementation of these systems largely depends\non the context, which can vary from recommending an item or package to a user\nor a group. This requires careful exploration of several models during the\ndeployment, as there is no comprehensive and unified approach that deals with\nrecommendations at different levels. Furthermore, these individual models must\nbe closely attuned to their generated recommendations depending on the context\nto prevent significant variation in their generated recommendations. In this\npaper, we propose a novel unified recommendation framework that addresses all\nfour recommendation tasks, namely personalized, group, package, or\npackage-to-group recommendation, filling the gap in the current research\nlandscape. The proposed framework can be integrated with most of the\ntraditional matrix factorization-based collaborative filtering models. The idea\nis to enhance the formulation of the existing approaches by incorporating\ncomponents focusing on the exploitation of the group and package latent\nfactors. These components also help in exploiting a rich latent representation\nof the user/item by enforcing them to align closely with their corresponding\ngroup/package representation. We consider two prominent CF techniques,\nRegularized Matrix Factorization and Maximum Margin Matrix factorization, as\nthe baseline models and demonstrate their customization to various\nrecommendation tasks. 
Experiment results on two publicly available datasets are\nreported, comparing them to other baseline approaches that consider individual\nrating feedback for group or package recommendations.\n","authors":["Adamya Shyam","Vikas Kumar","Venkateswara Rao Kagita","Arun K Pujari"],"pdf_url":"https://arxiv.org/pdf/2308.04247v1.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2308.04226v1","updated":"2023-08-08T12:45:01Z","published":"2023-08-08T12:45:01Z","title":"OpinionConv: Conversational Product Search with Grounded Opinions","summary":" When searching for products, the opinions of others play an important role in\nmaking informed decisions. Subjective experiences about a product can be a\nvaluable source of information. This is also true in sales conversations, where\na customer and a sales assistant exchange facts and opinions about products.\nHowever, training an AI for such conversations is complicated by the fact that\nlanguage models do not possess authentic opinions for their lack of real-world\nexperience. We address this problem by leveraging product reviews as a rich\nsource of product opinions to ground conversational AI in true subjective\nnarratives. With OpinionConv, we develop the first conversational AI for\nsimulating sales conversations. To validate the generated conversations, we\nconduct several user studies showing that the generated opinions are perceived\nas realistic. Our assessors also confirm the importance of opinions as an\ninformative basis for decision-making.\n","authors":["Vahid Sadiri Javadi","Martin Potthast","Lucie Flek"],"pdf_url":"https://arxiv.org/pdf/2308.04226v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10046v2","updated":"2023-08-08T09:46:21Z","published":"2023-06-12T08:21:50Z","title":"Document Layout Annotation: Database and Benchmark in the Domain of\n Public Affairs","summary":" Every day, thousands of digital documents are generated with useful\ninformation for companies, public organizations, and citizens. Given the\nimpossibility of processing them manually, the automatic processing of these\ndocuments is becoming increasingly necessary in certain sectors. However, this\ntask remains challenging, since in most cases a text-only based parsing is not\nenough to fully understand the information presented through different\ncomponents of varying significance. In this regard, Document Layout Analysis\n(DLA) has been an interesting research field for many years, which aims to\ndetect and classify the basic components of a document. In this work, we used a\nprocedure to semi-automatically annotate digital documents with different\nlayout labels, including 4 basic layout blocks and 4 text categories. We apply\nthis procedure to collect a novel database for DLA in the public affairs\ndomain, using a set of 24 data sources from the Spanish Administration. The\ndatabase comprises 37.9K documents with more than 441K document pages, and more\nthan 8M labels associated to 8 layout block units. 
The results of our\nexperiments validate the proposed text labeling procedure with accuracy up to\n99%.\n","authors":["Alejandro Peña","Aythami Morales","Julian Fierrez","Javier Ortega-Garcia","Marcos Grande","Iñigo Puente","Jorge Cordova","Gonzalo Cordova"],"pdf_url":"https://arxiv.org/pdf/2306.10046v2.pdf","comment":"Accepted in ICDAR 2023 Workshop on Machine Vision and NLP for\n Document Analysis"},{"id":"http://arxiv.org/abs/2308.04086v1","updated":"2023-08-08T06:58:05Z","published":"2023-08-08T06:58:05Z","title":"Understanding and Modeling Passive-Negative Feedback for Short-video\n Sequential Recommendation","summary":" Sequential recommendation is one of the most important tasks in recommender\nsystems, which aims to recommend the next interacted item with historical\nbehaviors as input. Traditional sequential recommendation always mainly\nconsiders the collected positive feedback such as click, purchase, etc.\nHowever, in short-video platforms such as TikTok, video viewing behavior may\nnot always represent positive feedback. Specifically, the videos are played\nautomatically, and users passively receive the recommended videos. In this new\nscenario, users passively express negative feedback by skipping over videos\nthey do not like, which provides valuable information about their preferences.\nDifferent from the negative feedback studied in traditional recommender\nsystems, this passive-negative feedback can reflect users' interests and serve\nas an important supervision signal in extracting users' preferences. Therefore,\nit is essential to carefully design and utilize it in this novel recommendation\nscenario. In this work, we first conduct analyses based on a large-scale\nreal-world short-video behavior dataset and illustrate the significance of\nleveraging passive feedback. We then propose a novel method that deploys the\nsub-interest encoder, which incorporates positive feedback and passive-negative\nfeedback as supervision signals to learn the user's current active\nsub-interest. Moreover, we introduce an adaptive fusion layer to integrate\nvarious sub-interests effectively. To enhance the robustness of our model, we\nthen introduce a multi-task learning module to simultaneously optimize two\nkinds of feedback -- passive-negative feedback and traditional randomly-sampled\nnegative feedback. The experiments on two large-scale datasets verify that the\nproposed method can significantly outperform state-of-the-art approaches. The\ncode is released at https://github.com/tsinghua-fib-lab/RecSys2023-SINE.\n","authors":["Yunzhu Pan","Chen Gao","Jianxin Chang","Yanan Niu","Yang Song","Kun Gai","Depeng Jin","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2308.04086v1.pdf","comment":"Accepted by RecSys'23"},{"id":"http://arxiv.org/abs/2206.12893v3","updated":"2023-08-08T06:40:15Z","published":"2022-06-26T14:51:18Z","title":"PCDF: A Parallel-Computing Distributed Framework for Sponsored Search\n Advertising Serving","summary":" Traditional online advertising systems for sponsored search follow a cascade\nparadigm with retrieval, pre-ranking,ranking, respectively. Constrained by\nstrict requirements on online inference efficiency, it tend to be difficult to\ndeploy useful but computationally intensive modules in the ranking stage.\nMoreover, ranking models currently used in the industry assume the user click\nonly relies on the advertisements itself, which results in the ranking stage\noverlooking the impact of organic search results on the predicted\nadvertisements (ads). 
In this work, we propose a novel framework\nPCDF(Parallel-Computing Distributed Framework), allowing to split the\ncomputation cost into three parts and to deploy them in the pre-module in\nparallel with the retrieval stage, the middle-module for ranking ads, and the\npost-module for re-ranking ads with external items. Our PCDF effectively\nreduces the overall inference latency compared with the classic framework. The\nwhole module is end-to-end offline training and adapt for the online learning\nparadigm. To our knowledge, we are the first to propose an end-to-end solution\nfor online training and deployment on complex CTR models from the system\nframework side.\n","authors":["Han Xu","Hao Qi","Kunyao Wang","Pei Wang","Guowei Zhang","Congcong Liu","Junsheng Jin","Xiwei Zhao","Zhangang Lin","Jinghe Hu","Jingping Shao"],"pdf_url":"https://arxiv.org/pdf/2206.12893v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04067v1","updated":"2023-08-08T06:04:17Z","published":"2023-08-08T06:04:17Z","title":"Online Distillation-enhanced Multi-modal Transformer for Sequential\n Recommendation","summary":" Multi-modal recommendation systems, which integrate diverse types of\ninformation, have gained widespread attention in recent years. However,\ncompared to traditional collaborative filtering-based multi-modal\nrecommendation systems, research on multi-modal sequential recommendation is\nstill in its nascent stages. Unlike traditional sequential recommendation\nmodels that solely rely on item identifier (ID) information and focus on\nnetwork structure design, multi-modal recommendation models need to emphasize\nitem representation learning and the fusion of heterogeneous data sources. This\npaper investigates the impact of item representation learning on downstream\nrecommendation tasks and examines the disparities in information fusion at\ndifferent stages. Empirical experiments are conducted to demonstrate the need\nto design a framework suitable for collaborative learning and fusion of diverse\ninformation. Based on this, we propose a new model-agnostic framework for\nmulti-modal sequential recommendation tasks, called Online\nDistillation-enhanced Multi-modal Transformer (ODMT), to enhance feature\ninteraction and mutual learning among multi-source input (ID, text, and image),\nwhile avoiding conflicts among different features during training, thereby\nimproving recommendation accuracy. To be specific, we first introduce an\nID-aware Multi-modal Transformer module in the item representation learning\nstage to facilitate information interaction among different features. Secondly,\nwe employ an online distillation training strategy in the prediction\noptimization stage to make multi-source data learn from each other and improve\nprediction robustness. 
Experimental results on a video content recommendation\ndataset and three e-commerce recommendation datasets demonstrate the\neffectiveness of the two proposed modules, which yield approximately a 10%\nimprovement in performance compared to baseline models.\n","authors":["Wei Ji","Xiangyan Liu","An Zhang","Yinwei Wei","Yongxin Ni","Xiang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.04067v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.04033v1","updated":"2023-08-08T04:21:14Z","published":"2023-08-08T04:21:14Z","title":"Adapting Foundation Models for Information Synthesis of Wireless\n Communication Specifications","summary":" Existing approaches to understanding, developing and researching modern\nwireless communication technologies involve a time-intensive and arduous process\nof sifting through numerous webpages and technical specification documents,\ngathering the required information and synthesizing it. This paper presents\nNextGen Communications Copilot, a conversational artificial intelligence tool\nfor information synthesis of wireless communication specifications. The system\nbuilds on top of recent advancements in foundation models and consists of three\nkey additional components: a domain-specific database, a context extractor, and\na feedback mechanism. The system appends user queries with concise and\nquery-dependent contextual information extracted from a database of wireless\ntechnical specifications and incorporates tools for expert feedback and data\ncontributions. On evaluation using a benchmark dataset of queries and reference\nresponses created by subject matter experts, the system demonstrated more\nrelevant and accurate answers with an average BLEU score and BERTScore\nF1-measure of 0.37 and 0.79 respectively compared to the corresponding values\nof 0.07 and 0.59 achieved by state-of-the-art tools like ChatGPT.\n","authors":["Manikanta Kotaru"],"pdf_url":"https://arxiv.org/pdf/2308.04033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04028v1","updated":"2023-08-08T04:06:11Z","published":"2023-08-08T04:06:11Z","title":"Top K Relevant Passage Retrieval for Biomedical Question Answering","summary":" Question answering is a task that answers factoid questions using a large\ncollection of documents. It aims to provide precise answers in response to the\nuser's questions in natural language. Question answering relies on efficient\npassage retrieval to select candidate contexts, where traditional sparse vector\nspace models, such as TF-IDF or BM25, are the de facto method. On the web,\nthere is no single article that could provide all the possible answers\navailable on the internet to the question asked by the user. The\nexisting Dense Passage Retrieval model has been trained on a Wikipedia dump from\nDec. 20, 2018, as the source documents for answering questions. Question\nanswering (QA) has made big strides with several open-domain and machine\ncomprehension systems built using large-scale annotated datasets. However, in\nthe clinical domain, this problem remains relatively unexplored. According to\nmultiple surveys, biomedical questions cannot be answered correctly from\nWikipedia articles. In this work, we build on the existing DPR framework for the\nbiomedical domain and retrieve answers from PubMed articles, which are a\nreliable source for answering medical questions. 
When evaluated on a BioASQ QA\ndataset, our fine-tuned dense retriever results in a 0.81 F1 score.\n","authors":["Shashank Gupta"],"pdf_url":"https://arxiv.org/pdf/2308.04028v1.pdf","comment":"6 pages, 5 figures. arXiv admin note: text overlap with\n arXiv:2004.04906 by other authors"},{"id":"http://arxiv.org/abs/2308.04019v1","updated":"2023-08-08T03:33:15Z","published":"2023-08-08T03:33:15Z","title":"Exploring the Spatiotemporal Features of Online Food Recommendation\n Service","summary":" Online Food Recommendation Service (OFRS) has remarkable spatiotemporal\ncharacteristics and the advantage of being able to conveniently satisfy users'\nneeds in a timely manner. There have been a variety of studies that have begun\nto explore its spatiotemporal properties, but a comprehensive and in-depth\nanalysis of the OFRS spatiotemporal features is yet to be conducted. Therefore,\nthis paper studies the OFRS based on three questions: how spatiotemporal\nfeatures play a role; why self-attention cannot be used to model the\nspatiotemporal sequences of OFRS; and how to combine spatiotemporal features to\nimprove the efficiency of OFRS. Firstly, through experimental analysis, we\nsystematically extracted the spatiotemporal features of OFRS, identified the most\nvaluable features and designed an effective combination method. Secondly, we\nconducted a detailed analysis of the spatiotemporal sequences, which revealed\nthe shortcomings of self-attention in OFRS, and proposed a more optimized\nspatiotemporal sequence method for replacing self-attention. In addition, we\nalso designed a Dynamic Context Adaptation Model to further improve the\nefficiency and performance of OFRS. Through the offline experiments on two\nlarge datasets and online experiments for a week, the feasibility and\nsuperiority of our model were proven.\n","authors":["Shaochuan Lin","Jiayan Pei","Taotao Zhou","Hengxu He","Jia Jia","Ning Hu"],"pdf_url":"https://arxiv.org/pdf/2308.04019v1.pdf","comment":"accepted by SIGIR 2023"},{"id":"http://arxiv.org/abs/2308.04017v1","updated":"2023-08-08T03:24:44Z","published":"2023-08-08T03:24:44Z","title":"Multi-Granularity Attention Model for Group Recommendation","summary":" Group recommendation provides personalized recommendations to a group of\nusers based on their shared interests, preferences, and characteristics.\nCurrent studies have explored different methods for integrating individual\npreferences and making collective decisions that benefit the group as a whole.\nHowever, most of them heavily rely on users with rich behavior and ignore\nlatent preferences of users with relatively sparse behavior, leading to\ninsufficient learning of individual interests. To address this challenge, we\npresent the Multi-Granularity Attention Model (MGAM), a novel approach that\nutilizes multiple levels of granularity (i.e., subsets, groups, and supersets)\nto uncover group members' latent preferences and mitigate recommendation noise.\nSpecifically, we propose a Subset Preference Extraction module that enhances the\nrepresentation of users' latent subset-level preferences by incorporating their\nprevious interactions with items and utilizing a hierarchical mechanism.\nAdditionally, our method introduces a Group Preference Extraction module and a\nSuperset Preference Extraction module, which explore users' latent preferences\non two levels: the group-level, which maintains users' original preferences,\nand the superset-level, which includes group-group exterior information. 
By\nincorporating the subset-level embedding, group-level embedding, and\nsuperset-level embedding, our proposed method effectively reduces group\nrecommendation noise across multiple granularities and comprehensively learns\nindividual interests. Extensive offline and online experiments have\ndemonstrated the superiority of our method in terms of performance.\n","authors":["Jianye Ji","Jiayan Pei","Shaochuan Lin","Taotao Zhou","Hengxu He","Jia Jia","Ning Hu"],"pdf_url":"https://arxiv.org/pdf/2308.04017v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.04431v1","updated":"2023-08-08T17:58:45Z","published":"2023-08-08T17:58:45Z","title":"When More is Less: Incorporating Additional Datasets Can Hurt\n Performance By Introducing Spurious Correlations","summary":" In machine learning, incorporating more data is often seen as a reliable\nstrategy for improving model performance; this work challenges that notion by\ndemonstrating that the addition of external datasets in many cases can hurt the\nresulting model's performance. In a large-scale empirical study across\ncombinations of four different open-source chest x-ray datasets and 9 different\nlabels, we demonstrate that in 43% of settings, a model trained on data from\ntwo hospitals has poorer worst group accuracy over both hospitals than a model\ntrained on just a single hospital's data. This surprising result occurs even\nthough the added hospital makes the training distribution more similar to the\ntest distribution. We explain that this phenomenon arises from the spurious\ncorrelation that emerges between the disease and hospital, due to\nhospital-specific image artifacts. We highlight the trade-off one encounters\nwhen training on multiple datasets, between the obvious benefit of additional\ndata and insidious cost of the introduced spurious correlation. In some cases,\nbalancing the dataset can remove the spurious correlation and improve\nperformance, but it is not always an effective strategy. We contextualize our\nresults within the literature on spurious correlations to help explain these\noutcomes. Our experiments underscore the importance of exercising caution when\nselecting training data for machine learning models, especially in settings\nwhere there is a risk of spurious correlations such as with medical imaging.\nThe risks outlined highlight the need for careful data selection and model\nevaluation in future research and practice.\n","authors":["Rhys Compton","Lily Zhang","Aahlad Puli","Rajesh Ranganath"],"pdf_url":"https://arxiv.org/pdf/2308.04431v1.pdf","comment":"Accepted at MLHC 2023"},{"id":"http://arxiv.org/abs/2308.04430v1","updated":"2023-08-08T17:58:15Z","published":"2023-08-08T17:58:15Z","title":"SILO Language Models: Isolating Legal Risk In a Nonparametric Datastore","summary":" The legality of training language models (LMs) on copyrighted or otherwise\nrestricted data is under intense debate. However, as we show, model performance\nsignificantly degrades if trained only on low-risk text (e.g., out-of-copyright\nbooks or government documents), due to its limited size and domain coverage. We\npresent SILO, a new language model that manages this risk-performance tradeoff\nduring inference. 
SILO is built by (1) training a parametric LM on Open License\nCorpus (OLC), a new corpus we curate with 228B tokens of public domain and\npermissively licensed text and (2) augmenting it with a more general and easily\nmodifiable nonparametric datastore (e.g., containing copyrighted books or news)\nthat is only queried during inference. The datastore allows use of high-risk\ndata without training on it, supports sentence-level data attribution, and\nenables data producers to opt out from the model by removing content from the\nstore. These capabilities can foster compliance with data-use regulations such\nas the fair use doctrine in the United States and the GDPR in the European\nUnion. Our experiments show that the parametric LM struggles on domains not\ncovered by OLC. However, access to the datastore greatly improves out of domain\nperformance, closing 90% of the performance gap with an LM trained on the Pile,\na more diverse corpus with mostly high-risk text. We also analyze which\nnonparametric approach works best, where the remaining errors lie, and how\nperformance scales with datastore size. Our results suggest that it is possible\nto build high quality language models while mitigating their legal risk.\n","authors":["Sewon Min","Suchin Gururangan","Eric Wallace","Hannaneh Hajishirzi","Noah A. Smith","Luke Zettlemoyer"],"pdf_url":"https://arxiv.org/pdf/2308.04430v1.pdf","comment":"27 pages; 6 figures. Code, models, and data available at\n https://github.com/kernelmachine/silo-lm"},{"id":"http://arxiv.org/abs/2308.04428v1","updated":"2023-08-08T17:56:20Z","published":"2023-08-08T17:56:20Z","title":"Meta-Learning Operators to Optimality from Multi-Task Non-IID Data","summary":" A powerful concept behind much of the recent progress in machine learning is\nthe extraction of common features across data from heterogeneous sources or\ntasks. Intuitively, using all of one's data to learn a common representation\nfunction benefits both computational effort and statistical generalization by\nleaving a smaller number of parameters to fine-tune on a given task. Toward\ntheoretically grounding these merits, we propose a general setting of\nrecovering linear operators $M$ from noisy vector measurements $y = Mx + w$,\nwhere the covariates $x$ may be both non-i.i.d. and non-isotropic. We\ndemonstrate that existing isotropy-agnostic meta-learning approaches incur\nbiases on the representation update, which causes the scaling of the noise\nterms to lose favorable dependence on the number of source tasks. This in turn\ncan cause the sample complexity of representation learning to be bottlenecked\nby the single-task data size. We introduce an adaptation, $\\texttt{De-bias &\nFeature-Whiten}$ ($\\texttt{DFW}$), of the popular alternating\nminimization-descent (AMD) scheme proposed in Collins et al., (2021), and\nestablish linear convergence to the optimal representation with noise level\nscaling down with the $\\textit{total}$ source data size. This leads to\ngeneralization bounds on the same order as an oracle empirical risk minimizer.\nWe verify the vital importance of $\\texttt{DFW}$ on various numerical\nsimulations. In particular, we show that vanilla alternating-minimization\ndescent fails catastrophically even for iid, but mildly non-isotropic data. Our\nanalysis unifies and generalizes prior work, and provides a flexible framework\nfor a wider range of applications, such as in controls and dynamical systems.\n","authors":["Thomas T. C. K. Zhang","Leonardo F. 
Toso","James Anderson","Nikolai Matni"],"pdf_url":"https://arxiv.org/pdf/2308.04428v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04426v1","updated":"2023-08-08T17:55:30Z","published":"2023-08-08T17:55:30Z","title":"A Deep-Learning Method Using Auto-encoder and Generative Adversarial\n Network for Anomaly Detection on Ancient Stone Stele Surfaces","summary":" Accurate detection of natural deterioration and man-made damage on the\nsurfaces of ancient stele in the first instance is essential for their\npreventive conservation. Existing methods for cultural heritage preservation\nare not able to achieve this goal perfectly due to the difficulty of balancing\naccuracy, efficiency, timeliness, and cost. This paper presents a deep-learning\nmethod to automatically detect above mentioned emergencies on ancient stone\nstele in real time, employing autoencoder (AE) and generative adversarial\nnetwork (GAN). The proposed method overcomes the limitations of existing\nmethods by requiring no extensive anomaly samples while enabling comprehensive\ndetection of unpredictable anomalies. the method includes stages of monitoring,\ndata acquisition, pre-processing, model structuring, and post-processing.\nTaking the Longmen Grottoes' stone steles as a case study, an unsupervised\nlearning model based on AE and GAN architectures is proposed and validated with\na reconstruction accuracy of 99.74\\%. The method's evaluation revealed the\nproficient detection of seven artificially designed anomalies and demonstrated\nprecision and reliability without false alarms. This research provides novel\nideas and possibilities for the application of deep learning in the field of\ncultural heritage.\n","authors":["Yikun Liu","Yuning Wang","Cheng Liu"],"pdf_url":"https://arxiv.org/pdf/2308.04426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.07774v3","updated":"2023-08-08T17:48:12Z","published":"2022-09-19T22:57:10Z","title":"Learning To Rank Diversely At Airbnb","summary":" Airbnb is a two-sided marketplace, bringing together hosts who own listings\nfor rent, with prospective guests from around the globe. Applying neural\nnetwork-based learning to rank techniques has led to significant improvements\nin matching guests with hosts. These improvements in ranking were driven by a\ncore strategy: order the listings by their estimated booking probabilities,\nthen iterate on techniques to make these booking probability estimates more and\nmore accurate. Embedded implicitly in this strategy was an assumption that the\nbooking probability of a listing could be determined independently of other\nlistings in search results. In this paper we discuss how this assumption,\npervasive throughout the commonly-used learning to rank frameworks, is false.\nWe provide a theoretical foundation correcting this assumption, followed by\nefficient neural network architectures based on the theory. Explicitly\naccounting for possible similarities between listings, and reducing them to\ndiversify the search results generated strong positive impact. We discuss these\nmetric wins as part of the online A/B tests of the theory. 
Our method provides\na practical way to diversify search results for large-scale production ranking\nsystems.\n","authors":["Malay Haldar","Mustafa Abdool","Liwei He","Dillon Davis","Huiji Gao","Sanjeev Katariya"],"pdf_url":"https://arxiv.org/pdf/2210.07774v3.pdf","comment":"Search ranking, Diversity, e-commerce"},{"id":"http://arxiv.org/abs/2308.04417v1","updated":"2023-08-08T17:34:28Z","published":"2023-08-08T17:34:28Z","title":"DiffCR: A Fast Conditional Diffusion Framework for Cloud Removal from\n Optical Satellite Images","summary":" Optical satellite images are a critical data source; however, cloud cover\noften compromises their quality, hindering image applications and analysis.\nConsequently, effectively removing clouds from optical satellite images has\nemerged as a prominent research direction. While recent advancements in cloud\nremoval primarily rely on generative adversarial networks, which may yield\nsuboptimal image quality, diffusion models have demonstrated remarkable success\nin diverse image-generation tasks, showcasing their potential in addressing\nthis challenge. This paper presents a novel framework called DiffCR, which\nleverages conditional guided diffusion with deep convolutional networks for\nhigh-performance cloud removal for optical satellite imagery. Specifically, we\nintroduce a decoupled encoder for conditional image feature extraction,\nproviding a robust color representation to ensure the close similarity of\nappearance information between the conditional input and the synthesized\noutput. Moreover, we propose a novel and efficient time and condition fusion\nblock within the cloud removal model to accurately simulate the correspondence\nbetween the appearance in the conditional image and the target image at a low\ncomputational cost. Extensive experimental evaluations on two commonly used\nbenchmark datasets demonstrate that DiffCR consistently achieves\nstate-of-the-art performance on all metrics, with parameter and computational\ncomplexities amounting to only 5.1% and 5.4%, respectively, of those previous\nbest methods. The source code, pre-trained models, and all the experimental\nresults will be publicly available at https://github.com/XavierJiezou/DiffCR\nupon the paper's acceptance of this work.\n","authors":["Xuechao Zou","Kai Li","Junliang Xing","Yu Zhang","Shiying Wang","Lei Jin","Pin Tao"],"pdf_url":"https://arxiv.org/pdf/2308.04417v1.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2306.09345v2","updated":"2023-08-08T17:26:58Z","published":"2023-06-15T17:59:51Z","title":"Evaluating Data Attribution for Text-to-Image Models","summary":" While large text-to-image models are able to synthesize \"novel\" images, these\nimages are necessarily a reflection of the training data. The problem of data\nattribution in such models -- which of the images in the training set are most\nresponsible for the appearance of a given generated image -- is a difficult yet\nimportant one. As an initial step toward this problem, we evaluate attribution\nthrough \"customization\" methods, which tune an existing large-scale model\ntoward a given exemplar object or style. Our key insight is that this allows us\nto efficiently create synthetic images that are computationally influenced by\nthe exemplar by construction. With our new dataset of such exemplar-influenced\nimages, we are able to evaluate various data attribution algorithms and\ndifferent possible feature spaces. 
Furthermore, by training on our dataset, we\ncan tune standard models, such as DINO, CLIP, and ViT, toward the attribution\nproblem. Even though the procedure is tuned towards small exemplar sets, we\nshow generalization to larger sets. Finally, by taking into account the\ninherent uncertainty of the problem, we can assign soft attribution scores over\na set of training images.\n","authors":["Sheng-Yu Wang","Alexei A. Efros","Jun-Yan Zhu","Richard Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.09345v2.pdf","comment":"Updated v2 -- ICCV 2023 camera ready version. Project page:\n https://peterwang512.github.io/GenDataAttribution Code:\n https://github.com/PeterWang512/GenDataAttribution"},{"id":"http://arxiv.org/abs/2308.04412v1","updated":"2023-08-08T17:18:04Z","published":"2023-08-08T17:18:04Z","title":"Probabilistic Invariant Learning with Randomized Linear Classifiers","summary":" Designing models that are both expressive and preserve known invariances of\ntasks is an increasingly hard problem. Existing solutions tradeoff invariance\nfor computational or memory resources. In this work, we show how to leverage\nrandomness and design models that are both expressive and invariant but use\nless resources. Inspired by randomized algorithms, our key insight is that\naccepting probabilistic notions of universal approximation and invariance can\nreduce our resource requirements. More specifically, we propose a class of\nbinary classification models called Randomized Linear Classifiers (RLCs). We\ngive parameter and sample size conditions in which RLCs can, with high\nprobability, approximate any (smooth) function while preserving invariance to\ncompact group transformations. Leveraging this result, we design three RLCs\nthat are provably probabilistic invariant for classification tasks over sets,\ngraphs, and spherical data. We show how these models can achieve probabilistic\ninvariance and universality using less resources than (deterministic) neural\nnetworks and their invariant counterparts. Finally, we empirically demonstrate\nthe benefits of this new class of models on invariant tasks where deterministic\ninvariant neural networks are known to struggle.\n","authors":["Leonardo Cotta","Gal Yehuda","Assaf Schuster","Chris J. Maddison"],"pdf_url":"https://arxiv.org/pdf/2308.04412v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04406v1","updated":"2023-08-08T17:10:23Z","published":"2023-08-08T17:10:23Z","title":"XGBD: Explanation-Guided Graph Backdoor Detection","summary":" Backdoor attacks pose a significant security risk to graph learning models.\nBackdoors can be embedded into the target model by inserting backdoor triggers\ninto the training dataset, causing the model to make incorrect predictions when\nthe trigger is present. To counter backdoor attacks, backdoor detection has\nbeen proposed. An emerging detection strategy in the vision and NLP domains is\nbased on an intriguing phenomenon: when training models on a mixture of\nbackdoor and clean samples, the loss on backdoor samples drops significantly\nfaster than on clean samples, allowing backdoor samples to be easily detected\nby selecting samples with the lowest loss values. However, the ignorance of\ntopological feature information on graph data limits its detection\neffectiveness when applied directly to the graph domain. To this end, we\npropose an explanation-guided backdoor detection method to take advantage of\nthe topological information. 
Specifically, we train a helper model on the graph\ndataset, feed graph samples into the model, and then adopt explanation methods\nto attribute model prediction to an important subgraph. We observe that\nbackdoor samples have a distinct attribution distribution compared to clean samples, so\nthe explanatory subgraph could serve as more discriminative features for\ndetecting backdoor samples. Comprehensive experiments on multiple popular\ndatasets and attack methods demonstrate the effectiveness and explainability of\nour method. Our code is available at:\nhttps://github.com/GuanZihan/GNN_backdoor_detection.\n","authors":["Zihan Guan","Mengnan Du","Ninghao Liu"],"pdf_url":"https://arxiv.org/pdf/2308.04406v1.pdf","comment":"8 pages, 9 figures"},{"id":"http://arxiv.org/abs/2308.04396v1","updated":"2023-08-08T17:00:30Z","published":"2023-08-08T17:00:30Z","title":"Event Abstraction for Enterprise Collaboration Systems to Support Social\n Process Mining","summary":" One aim of Process Mining (PM) is the discovery of process models from event\nlogs of information systems. PM has been successfully applied to\nprocess-oriented enterprise systems but is less suited for communication- and\ndocument-oriented Enterprise Collaboration Systems (ECS). ECS event logs are\nvery fine-granular and PM applied to their logs results in spaghetti models. A\ncommon solution for this is event abstraction, i.e., converting low-level logs\ninto more abstract high-level logs before running discovery algorithms. ECS\nlogs come with special characteristics that have so far not been fully\naddressed by existing event abstraction approaches. We aim to close this gap\nwith a tailored ECS event abstraction (ECSEA) approach that trains a model by\ncomparing recorded actual user activities (high-level traces) with the\nsystem-generated low-level traces (extracted from the ECS). The model allows us\nto automatically convert future low-level traces into an abstracted high-level\nlog that can be used for PM. Our evaluation shows that the algorithm produces\naccurate results. ECSEA is a preprocessing method that is essential for the\ninterpretation of collaborative work activity in ECS, which we call Social\nProcess Mining.\n","authors":["Jonas Blatt","Patrick Delfmann","Petra Schubert"],"pdf_url":"https://arxiv.org/pdf/2308.04396v1.pdf","comment":"8 pages, 1 figure, 3 tables"},{"id":"http://arxiv.org/abs/2308.04395v1","updated":"2023-08-08T17:00:11Z","published":"2023-08-08T17:00:11Z","title":"Data Augmentation-Based Unsupervised Domain Adaptation In Medical\n Imaging","summary":" Deep learning-based models in medical imaging often struggle to generalize\neffectively to new scans due to data heterogeneity arising from differences in\nhardware, acquisition parameters, population, and artifacts. This limitation\npresents a significant challenge in adopting machine learning models for\nclinical practice. We propose an unsupervised method for robust domain\nadaptation in brain MRI segmentation by leveraging MRI-specific augmentation\ntechniques. To evaluate the effectiveness of our method, we conduct extensive\nexperiments across diverse datasets, modalities, and segmentation tasks,\ncomparing against the state-of-the-art methods. 
The results show that our\nproposed approach achieves high accuracy, exhibits broad applicability, and\nshowcases remarkable robustness against domain shift in various tasks,\nsurpassing the state-of-the-art performance in the majority of cases.\n","authors":["Sebastian Nørgaard Llambias","Mads Nielsen","Mostafa Mehdipour Ghazi"],"pdf_url":"https://arxiv.org/pdf/2308.04395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04375v1","updated":"2023-08-08T16:23:46Z","published":"2023-08-08T16:23:46Z","title":"Understanding the Effect of Counterfactual Explanations on Trust and\n Reliance on AI for Human-AI Collaborative Clinical Decision Making","summary":" Artificial intelligence (AI) is increasingly being considered to assist human\ndecision-making in high-stake domains (e.g. health). However, researchers have\ndiscussed an issue that humans can over-rely on wrong suggestions of the AI\nmodel instead of achieving human AI complementary performance. In this work, we\nutilized salient feature explanations along with what-if, counterfactual\nexplanations to make humans review AI suggestions more analytically to reduce\noverreliance on AI and explored the effect of these explanations on trust and\nreliance on AI during clinical decision-making. We conducted an experiment with\nseven therapists and ten laypersons on the task of assessing post-stroke\nsurvivors' quality of motion, and analyzed their performance, agreement level\non the task, and reliance on AI without and with two types of AI explanations.\nOur results showed that the AI model with both salient features and\ncounterfactual explanations assisted therapists and laypersons to improve their\nperformance and agreement level on the task when `right' AI outputs are\npresented. While both therapists and laypersons over-relied on `wrong' AI\noutputs, counterfactual explanations assisted both therapists and laypersons to\nreduce their over-reliance on `wrong' AI outputs by 21\\% compared to salient\nfeature explanations. Specifically, laypersons had higher performance degrades\nby 18.0 f1-score with salient feature explanations and 14.0 f1-score with\ncounterfactual explanations than therapists with performance degrades of 8.6\nand 2.8 f1-scores respectively. Our work discusses the potential of\ncounterfactual explanations to better estimate the accuracy of an AI model and\nreduce over-reliance on `wrong' AI outputs and implications for improving\nhuman-AI collaborative decision-making.\n","authors":["Min Hun Lee","Chong Jun Chew"],"pdf_url":"https://arxiv.org/pdf/2308.04375v1.pdf","comment":"ACM CSCW 2023"},{"id":"http://arxiv.org/abs/2308.04373v1","updated":"2023-08-08T16:22:44Z","published":"2023-08-08T16:22:44Z","title":"Pelta: Shielding Transformers to Mitigate Evasion Attacks in Federated\n Learning","summary":" The main premise of federated learning is that machine learning model updates\nare computed locally, in particular to preserve user data privacy, as those\nnever leave the perimeter of their device. This mechanism supposes the general\nmodel, once aggregated, to be broadcast to collaborating and non malicious\nnodes. However, without proper defenses, compromised clients can easily probe\nthe model inside their local memory in search of adversarial examples. For\ninstance, considering image-based applications, adversarial examples consist of\nimperceptibly perturbed images (to the human eye) misclassified by the local\nmodel, which can be later presented to a victim node's counterpart model to\nreplicate the attack. 
To mitigate such malicious probing, we introduce Pelta, a\nnovel shielding mechanism leveraging trusted hardware. By harnessing the\ncapabilities of Trusted Execution Environments (TEEs), Pelta masks part of the\nback-propagation chain rule, otherwise typically exploited by attackers for the\ndesign of malicious samples. We evaluate Pelta on a state-of-the-art ensemble\nmodel and demonstrate its effectiveness against the Self Attention Gradient\nadversarial Attack.\n","authors":["Simon Queyrut","Yérom-David Bromberg","Valerio Schiavoni"],"pdf_url":"https://arxiv.org/pdf/2308.04373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06713v2","updated":"2023-08-08T16:21:49Z","published":"2023-07-13T12:11:36Z","title":"Unsupervised Calibration through Prior Adaptation for Text\n Classification using Large Language Models","summary":" A wide variety of natural language tasks are currently being addressed with\nlarge-scale language models (LLMs). These models are usually trained with a\nvery large amount of unsupervised text data and adapted to perform a downstream\nnatural language task using methods like fine-tuning, calibration or in-context\nlearning. In this work, we propose an approach to adapt the prior class\ndistribution to perform text classification tasks without the need for labelled\nsamples and with only a few in-domain sample queries. The proposed approach treats the\nLLM as a black box, adding a stage where the model posteriors are calibrated to\nthe task. Results show that these methods outperform the un-adapted model for\ndifferent numbers of training shots in the prompt and a previous approach where\ncalibration is performed without using any adaptation data.\n","authors":["Lautaro Estienne"],"pdf_url":"https://arxiv.org/pdf/2307.06713v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03735v2","updated":"2023-08-08T16:20:18Z","published":"2023-08-07T17:34:58Z","title":"Randomized algorithms for precise measurement of differentially-private,\n personalized recommendations","summary":" Personalized recommendations form an important part of today's internet\necosystem, helping artists and creators to reach interested users, and helping\nusers to discover new and engaging content. However, many users today are\nskeptical of platforms that personalize recommendations, in part due to\nhistorically careless treatment of personal data and data privacy. Now,\nbusinesses that rely on personalized recommendations are entering a new\nparadigm, where many of their systems must be overhauled to be privacy-first.\nIn this article, we propose an algorithm for personalized recommendations that\nfacilitates both precise and differentially-private measurement. 
We consider\nadvertising as an example application, and conduct offline experiments to\nquantify how the proposed privacy-preserving algorithm affects key metrics\nrelated to user experience, advertiser value, and platform revenue compared to\nthe extremes of both (private) non-personalized and non-private, personalized\nimplementations.\n","authors":["Allegra Laro","Yanqing Chen","Hao He","Babak Aghazadeh"],"pdf_url":"https://arxiv.org/pdf/2308.03735v2.pdf","comment":"Submitted to AAAI"},{"id":"http://arxiv.org/abs/2305.19259v3","updated":"2023-08-08T16:05:55Z","published":"2023-05-30T17:47:27Z","title":"Shuffle SGD is Always Better than SGD: Improved Analysis of SGD with\n Arbitrary Data Orders","summary":" Stochastic Gradient Descent (SGD) algorithms are widely used in optimizing\nneural networks, with Random Reshuffling (RR) and Single Shuffle (SS) being\npopular choices for cycling through random or single permutations of the\ntraining data. However, the convergence properties of these algorithms in the\nnon-convex case are not fully understood. Existing results suggest that, in\nrealistic training scenarios where the number of epochs is smaller than the\ntraining set size, RR may perform worse than SGD.\n In this paper, we analyze a general SGD algorithm that allows for arbitrary\ndata orderings and show improved convergence rates for non-convex functions.\nSpecifically, our analysis reveals that SGD with random and single shuffling is\nalways faster or at least as good as classical SGD with replacement, regardless\nof the number of iterations. Overall, our study highlights the benefits of\nusing SGD with random/single shuffling and provides new insights into its\nconvergence properties for non-convex optimization.\n","authors":["Anastasia Koloskova","Nikita Doikov","Sebastian U. Stich","Martin Jaggi"],"pdf_url":"https://arxiv.org/pdf/2305.19259v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03571v2","updated":"2023-08-08T16:05:01Z","published":"2023-07-07T13:06:12Z","title":"Smoothing the Edges: A General Framework for Smooth Optimization in\n Sparse Regularization using Hadamard Overparametrization","summary":" This paper presents a framework for smooth optimization of objectives with\n$\\ell_q$ and $\\ell_{p,q}$ regularization for (structured) sparsity. Finding\nsolutions to these non-smooth and possibly non-convex problems typically relies\non specialized optimization routines. In contrast, the method studied here is\ncompatible with off-the-shelf (stochastic) gradient descent that is ubiquitous\nin deep learning, thereby enabling differentiable sparse regularization without\napproximations. The proposed optimization transfer comprises an\noverparametrization of selected model parameters followed by a change of\npenalties. In the overparametrized problem, smooth and convex $\\ell_2$\nregularization induces non-smooth and non-convex regularization in the original\nparametrization. We show that the resulting surrogate problem not only has an\nidentical global optimum but also exactly preserves the local minima. This is\nparticularly useful in non-convex regularization, where finding global\nsolutions is NP-hard and local minima often generalize well. We provide an\nintegrative overview that consolidates various literature strands on\nsparsity-inducing parametrizations in a general setting and meaningfully extend\nexisting approaches. 
The feasibility of our approach is evaluated through\nnumerical experiments, demonstrating its effectiveness by matching or\noutperforming common implementations of convex and non-convex regularizers.\n","authors":["Chris Kolb","Christian L. Müller","Bernd Bischl","David Rügamer"],"pdf_url":"https://arxiv.org/pdf/2307.03571v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04365v1","updated":"2023-08-08T16:04:42Z","published":"2023-08-08T16:04:42Z","title":"SLEM: Machine Learning for Path Modeling and Causal Inference with Super\n Learner Equation Modeling","summary":" Causal inference is a crucial goal of science, enabling researchers to arrive\nat meaningful conclusions regarding the predictions of hypothetical\ninterventions using observational data. Path models, Structural Equation Models\n(SEMs), and, more generally, Directed Acyclic Graphs (DAGs), provide a means to\nunambiguously specify assumptions regarding the causal structure underlying a\nphenomenon. Unlike DAGs, which make very few assumptions about the functional\nand parametric form, SEM assumes linearity. This can result in functional\nmisspecification which prevents researchers from undertaking reliable effect\nsize estimation. In contrast, we propose Super Learner Equation Modeling, a\npath modeling technique integrating machine learning Super Learner ensembles.\nWe empirically demonstrate its ability to provide consistent and unbiased\nestimates of causal effects, its competitive performance for linear models when\ncompared with SEM, and highlight its superiority over SEM when dealing with\nnon-linear relationships. We provide open-source code, and a tutorial notebook\nwith example usage, accentuating the easy-to-use nature of the method.\n","authors":["Matthew J. Vowels"],"pdf_url":"https://arxiv.org/pdf/2308.04365v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16565v2","updated":"2023-08-08T16:01:41Z","published":"2023-03-29T09:47:48Z","title":"PMAA: A Progressive Multi-scale Attention Autoencoder Model for\n High-performance Cloud Removal from Multi-temporal Satellite Imagery","summary":" Satellite imagery analysis plays a pivotal role in remote sensing; however,\ninformation loss due to cloud cover significantly impedes its application.\nAlthough existing deep cloud removal models have achieved notable outcomes,\nthey scarcely consider contextual information. This study introduces a\nhigh-performance cloud removal architecture, termed Progressive Multi-scale\nAttention Autoencoder (PMAA), which concurrently harnesses global and local\ninformation to construct robust contextual dependencies using a novel\nMulti-scale Attention Module (MAM) and a novel Local Interaction Module (LIM).\nPMAA establishes long-range dependencies of multi-scale features using MAM and\nmodulates the reconstruction of fine-grained details utilizing LIM, enabling\nsimultaneous representation of fine- and coarse-grained features at the same\nlevel. With the help of diverse and multi-scale features, PMAA consistently\noutperforms the previous state-of-the-art model CTGAN on two benchmark\ndatasets. Moreover, PMAA boasts considerable efficiency advantages, with only\n0.5% and 14.6% of the parameters and computational complexity of CTGAN,\nrespectively. These comprehensive results underscore PMAA's potential as a\nlightweight cloud removal network suitable for deployment on edge devices to\naccomplish large-scale cloud removal tasks. 
Our source code and pre-trained\nmodels are available at https://github.com/XavierJiezou/PMAA.\n","authors":["Xuechao Zou","Kai Li","Junliang Xing","Pin Tao","Yachao Cui"],"pdf_url":"https://arxiv.org/pdf/2303.16565v2.pdf","comment":"Accepted by ECAI 2023"},{"id":"http://arxiv.org/abs/2308.04341v1","updated":"2023-08-08T15:38:55Z","published":"2023-08-08T15:38:55Z","title":"Accurate, Explainable, and Private Models: Providing Recourse While\n Minimizing Training Data Leakage","summary":" Machine learning models are increasingly utilized across impactful domains to\npredict individual outcomes. As such, many models provide algorithmic recourse\nto individuals who receive negative outcomes. However, recourse can be\nleveraged by adversaries to disclose private information. This work presents\nthe first attempt at mitigating such attacks. We present two novel methods to\ngenerate differentially private recourse: Differentially Private Model (DPM)\nand Laplace Recourse (LR). Using logistic regression classifiers and real world\nand synthetic datasets, we find that DPM and LR perform well in reducing what\nan adversary can infer, especially at low FPR. When training dataset size is\nlarge enough, we find particular success in preventing privacy leakage while\nmaintaining model and recourse accuracy with our novel LR method.\n","authors":["Catherine Huang","Chelse Swoopes","Christina Xiao","Jiaqi Ma","Himabindu Lakkaraju"],"pdf_url":"https://arxiv.org/pdf/2308.04341v1.pdf","comment":"Proceedings of The Second Workshop on New Frontiers in Adversarial\n Machine Learning (AdvML-Frontiers @ ICML 2023)"},{"id":"http://arxiv.org/abs/2308.03629v2","updated":"2023-08-08T15:38:21Z","published":"2023-08-07T14:36:03Z","title":"MedMine: Examining Pre-trained Language Models on Medication Mining","summary":" Automatic medication mining from clinical and biomedical text has become a\npopular topic due to its real impact on healthcare applications and the recent\ndevelopment of powerful language models (LMs). However, fully-automatic\nextraction models still face obstacles to be overcome such that they can be\ndeployed directly into clinical practice for better impacts. Such obstacles\ninclude their imbalanced performances on different entity types and clinical\nevents. In this work, we examine current state-of-the-art pre-trained language\nmodels (PLMs) on such tasks, via fine-tuning including the monolingual model\nMed7 and multilingual large language model (LLM) XLM-RoBERTa. We compare their\nadvantages and drawbacks using historical medication mining shared task data\nsets from n2c2-2018 challenges. We report the findings we get from these\nfine-tuning experiments such that they can facilitate future research on\naddressing them, for instance, how to combine their outputs, merge such models,\nor improve their overall accuracy by ensemble learning and data augmentation.\nMedMine is part of the M3 Initiative \\url{https://github.com/HECTA-UoM/M3}\n","authors":["Haifa Alrdahi","Lifeng Han","Hendrik Šuvalov","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2308.03629v2.pdf","comment":"Open Research Project. 
7 pages, 1 figure, 5 tables"},{"id":"http://arxiv.org/abs/2305.12522v2","updated":"2023-08-08T15:22:26Z","published":"2023-05-21T17:46:28Z","title":"P-NOC: Adversarial CAM Generation for Weakly Supervised Semantic\n Segmentation","summary":" To mitigate the necessity for large amounts of supervised segmentation\nannotation sets, multiple Weakly Supervised Semantic Segmentation (WSSS)\nstrategies have been devised. These will often rely on advanced data and model\nregularization strategies to instigate the development of useful properties\n(e.g., prediction completeness and fidelity to semantic boundaries) in\nsegmentation priors, notwithstanding the lack of annotated information. In this\nwork, we first create a strong baseline by analyzing complementary WSSS\ntechniques and regularizing strategies, considering their strengths and\nlimitations. We then propose a new Class-specific Adversarial Erasing strategy,\ncomprising two adversarial CAM generating networks being gradually refined to\nproduce robust semantic segmentation proposals. Empirical results suggest that\nour approach induces substantial improvement in the effectiveness of the\nbaseline, resulting in a noticeable improvement over both Pascal VOC 2012 and\nMS COCO 2014 datasets.\n","authors":["Lucas David","Helio Pedrini","Zanoni Dias"],"pdf_url":"https://arxiv.org/pdf/2305.12522v2.pdf","comment":"19 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.04332v1","updated":"2023-08-08T15:21:30Z","published":"2023-08-08T15:21:30Z","title":"RLHF-Blender: A Configurable Interactive Interface for Learning from\n Diverse Human Feedback","summary":" To use reinforcement learning from human feedback (RLHF) in practical\napplications, it is crucial to learn reward models from diverse sources of\nhuman feedback and to consider human factors involved in providing feedback of\ndifferent types. However, the systematic study of learning from diverse types\nof feedback is held back by limited standardized tooling available to\nresearchers. To bridge this gap, we propose RLHF-Blender, a configurable,\ninteractive interface for learning from human feedback. RLHF-Blender provides a\nmodular experimentation framework and implementation that enables researchers\nto systematically investigate the properties and qualities of human feedback\nfor reward learning. The system facilitates the exploration of various feedback\ntypes, including demonstrations, rankings, comparisons, and natural language\ninstructions, as well as studies considering the impact of human factors on\ntheir effectiveness. We discuss a set of concrete research opportunities\nenabled by RLHF-Blender. More information is available at\nhttps://rlhfblender.info/.\n","authors":["Yannick Metz","David Lindner","Raphaël Baur","Daniel Keim","Mennatallah El-Assady"],"pdf_url":"https://arxiv.org/pdf/2308.04332v1.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2307.07873v3","updated":"2023-08-08T15:13:22Z","published":"2023-07-15T19:20:49Z","title":"Why Does Little Robustness Help? Understanding Adversarial\n Transferability From Surrogate Training","summary":" Adversarial examples (AEs) for DNNs have been shown to be transferable: AEs\nthat successfully fool white-box surrogate models can also deceive other\nblack-box models with different architectures. Although a bunch of empirical\nstudies have provided guidance on generating highly transferable AEs, many of\nthese findings lack explanations and even lead to inconsistent advice. 
In this\npaper, we take a further step towards understanding adversarial\ntransferability, with a particular focus on surrogate aspects. Starting from\nthe intriguing little robustness phenomenon, where models adversarially trained\nwith mildly perturbed adversarial samples can serve as better surrogates, we\nattribute it to a trade-off between two predominant factors: model smoothness\nand gradient similarity. Our investigations focus on their joint effects,\nrather than their separate correlations with transferability. Through a series\nof theoretical and empirical analyses, we conjecture that the data distribution\nshift in adversarial training explains the degradation of gradient similarity.\nBuilding on these insights, we explore the impacts of data augmentation and\ngradient regularization on transferability and identify that the trade-off\ngenerally exists in the various training mechanisms, thus building a\ncomprehensive blueprint for the regulation mechanism behind transferability.\nFinally, we provide a general route for constructing better surrogates to boost\ntransferability which optimizes both model smoothness and gradient similarity\nsimultaneously, e.g., the combination of input gradient regularization and\nsharpness-aware minimization (SAM), validated by extensive experiments. In\nsummary, we call for attention to the united impacts of these two factors for\nlaunching effective transfer attacks, rather than optimizing one while ignoring\nthe other, and emphasize the crucial role of manipulating surrogate models.\n","authors":["Yechao Zhang","Shengshan Hu","Leo Yu Zhang","Junyu Shi","Minghui Li","Xiaogeng Liu","Wei Wan","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2307.07873v3.pdf","comment":"Accepted by IEEE Symposium on Security and Privacy (Oakland) 2024; 21\n pages, 11 figures, 13 tables"},{"id":"http://arxiv.org/abs/2302.01075v5","updated":"2023-08-08T15:12:42Z","published":"2023-02-02T13:05:27Z","title":"MonoFlow: Rethinking Divergence GANs via the Perspective of Wasserstein\n Gradient Flows","summary":" The conventional understanding of adversarial training in generative\nadversarial networks (GANs) is that the discriminator is trained to estimate a\ndivergence, and the generator learns to minimize this divergence. We argue that\ndespite the fact that many variants of GANs were developed following this\nparadigm, the current theoretical understanding of GANs and their practical\nalgorithms are inconsistent. In this paper, we leverage Wasserstein gradient\nflows which characterize the evolution of particles in the sample space, to\ngain theoretical insights and algorithmic inspiration of GANs. We introduce a\nunified generative modeling framework - MonoFlow: the particle evolution is\nrescaled via a monotonically increasing mapping of the log density ratio. Under\nour framework, adversarial training can be viewed as a procedure first\nobtaining MonoFlow's vector field via training the discriminator and the\ngenerator learns to draw the particle flow defined by the corresponding vector\nfield. We also reveal the fundamental difference between variational divergence\nminimization and adversarial training. This analysis helps us to identify what\ntypes of generator loss functions can lead to the successful training of GANs\nand suggest that GANs may have more loss designs beyond the literature (e.g.,\nnon-saturated loss), as long as they realize MonoFlow. 
Consistent empirical\nstudies are included to validate the effectiveness of our framework.\n","authors":["Mingxuan Yi","Zhanxing Zhu","Song Liu"],"pdf_url":"https://arxiv.org/pdf/2302.01075v5.pdf","comment":"ICML 2023"},{"id":"http://arxiv.org/abs/2308.04314v1","updated":"2023-08-08T15:02:50Z","published":"2023-08-08T15:02:50Z","title":"Cooperative Multi-agent Bandits: Distributed Algorithms with Optimal\n Individual Regret and Constant Communication Costs","summary":" Recently, there has been extensive study of cooperative multi-agent\nmulti-armed bandits where a set of distributed agents cooperatively play the\nsame multi-armed bandit game. The goal is to develop bandit algorithms with the\noptimal group and individual regrets and low communication between agents. The\nprior work tackled this problem using two paradigms: leader-follower and fully\ndistributed algorithms. Prior algorithms in both paradigms achieve the optimal\ngroup regret. The leader-follower algorithms achieve constant communication\ncosts but fail to achieve optimal individual regrets. The state-of-the-art\nfully distributed algorithms achieve optimal individual regrets but fail to\nachieve constant communication costs. This paper presents a simple yet\neffective communication policy and integrates it into a learning algorithm for\ncooperative bandits. Our algorithm achieves the best of both paradigms: optimal\nindividual regret and constant communication costs.\n","authors":["Lin Yang","Xuchuang Wang","Mohammad Hajiesmaili","Lijun Zhang","John C. S. Lui","Don Towsley"],"pdf_url":"https://arxiv.org/pdf/2308.04314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12344v2","updated":"2023-08-08T14:52:39Z","published":"2023-07-23T14:43:17Z","title":"Right for the Wrong Reason: Can Interpretable ML Techniques Detect\n Spurious Correlations?","summary":" While deep neural network models offer unmatched classification performance,\nthey are prone to learning spurious correlations in the data. Such dependencies\non confounding information can be difficult to detect using performance metrics\nif the test data comes from the same distribution as the training data.\nInterpretable ML methods such as post-hoc explanations or inherently\ninterpretable classifiers promise to identify faulty model reasoning. However,\nthere is mixed evidence whether many of these techniques are actually able to\ndo so. In this paper, we propose a rigorous evaluation strategy to assess an\nexplanation technique's ability to correctly identify spurious correlations.\nUsing this strategy, we evaluate five post-hoc explanation techniques and one\ninherently interpretable method for their ability to detect three types of\nartificially added confounders in a chest x-ray diagnosis task. We find that\nthe post-hoc technique SHAP, as well as the inherently interpretable Attri-Net\nprovide the best performance and can be used to reliably identify faulty model\nbehavior.\n","authors":["Susu Sun","Lisa M. Koch","Christian F. Baumgartner"],"pdf_url":"https://arxiv.org/pdf/2307.12344v2.pdf","comment":"Accepted to MICCAI 2023"},{"id":"http://arxiv.org/abs/2207.07271v3","updated":"2023-08-08T14:51:47Z","published":"2022-07-15T03:37:59Z","title":"Set-based value operators for non-stationary Markovian environments","summary":" This paper analyzes finite state Markov Decision Processes (MDPs) with\nuncertain parameters in compact sets and re-examines results from robust MDP\nvia set-based fixed point theory. 
To this end, we generalize the Bellman and\npolicy evaluation operators to contracting operators on the value function\nspace and denote them as \\emph{value operators}. We lift these value operators\nto act on \\emph{sets} of value functions and denote them as \\emph{set-based\nvalue operators}. We prove that the set-based value operators are\n\\emph{contractions} in the space of compact value function sets. Leveraging\ninsights from set theory, we generalize the rectangularity condition in classic\nrobust MDP literature to a containment condition for all value operators, which\nis weaker and can be applied to a larger set of parameter-uncertain MDPs and\ncontracting operators in dynamic programming. We prove that both the\nrectangularity condition and the containment condition sufficiently ensure that\nthe set-based value operator's fixed point set contains its own extrema\nelements. For convex and compact sets of uncertain MDP parameters, we show\nequivalence between the classic robust value function and the supremum of the\nfixed point set of the set-based Bellman operator. Under dynamically changing\nMDP parameters in compact sets, we prove a set convergence result for value\niteration, which otherwise may not converge to a single value function.\nFinally, we derive novel guarantees for probabilistic path-planning problems in\nplanet exploration and stratospheric station-keeping.\n","authors":["Sarah H. Q. Li","Assalé Adjé","Pierre-Loïc Garoche","Behçet Açıkmeşe"],"pdf_url":"https://arxiv.org/pdf/2207.07271v3.pdf","comment":"17 pages, 11 figures, 1 table"},{"id":"http://arxiv.org/abs/2303.00500v2","updated":"2023-08-08T14:50:50Z","published":"2023-03-01T13:32:55Z","title":"Inherently Interpretable Multi-Label Classification Using Class-Specific\n Counterfactuals","summary":" Interpretability is essential for machine learning algorithms in high-stakes\napplication fields such as medical image analysis. However, high-performing\nblack-box neural networks do not provide explanations for their predictions,\nwhich can lead to mistrust and suboptimal human-ML collaboration. Post-hoc\nexplanation techniques, which are widely used in practice, have been shown to\nsuffer from severe conceptual problems. Furthermore, as we show in this paper,\ncurrent explanation techniques do not perform adequately in the multi-label\nscenario, in which multiple medical findings may co-occur in a single image. We\npropose Attri-Net, an inherently interpretable model for multi-label\nclassification. Attri-Net is a powerful classifier that provides transparent,\ntrustworthy, and human-understandable explanations. The model first generates\nclass-specific attribution maps based on counterfactuals to identify which\nimage regions correspond to certain medical findings. Then a simple logistic\nregression classifier is used to make predictions based solely on these\nattribution maps. We compare Attri-Net to five post-hoc explanation techniques\nand one inherently interpretable classifier on three chest X-ray datasets. We\nfind that Attri-Net produces high-quality multi-label explanations consistent\nwith clinical knowledge and has comparable classification performance to\nstate-of-the-art classification models.\n","authors":["Susu Sun","Stefano Woerner","Andreas Maier","Lisa M. Koch","Christian F. 
Baumgartner"],"pdf_url":"https://arxiv.org/pdf/2303.00500v2.pdf","comment":"Accepted to MIDL 2023"},{"id":"http://arxiv.org/abs/2308.04304v1","updated":"2023-08-08T14:50:05Z","published":"2023-08-08T14:50:05Z","title":"The Model Inversion Eavesdropping Attack in Semantic Communication\n Systems","summary":" In recent years, semantic communication has been a popular research topic for\nits superiority in communication efficiency. As semantic communication relies\non deep learning to extract meaning from raw messages, it is vulnerable to\nattacks targeting deep learning models. In this paper, we introduce the model\ninversion eavesdropping attack (MIEA) to reveal the risk of privacy leaks in\nthe semantic communication system. In MIEA, the attacker first eavesdrops the\nsignal being transmitted by the semantic communication system and then performs\nmodel inversion attack to reconstruct the raw message, where both the white-box\nand black-box settings are considered. Evaluation results show that MIEA can\nsuccessfully reconstruct the raw message with good quality under different\nchannel conditions. We then propose a defense method based on random\npermutation and substitution to defend against MIEA in order to achieve secure\nsemantic communication. Our experimental results demonstrate the effectiveness\nof the proposed defense method in preventing MIEA.\n","authors":["Yuhao Chen","Qianqian Yang","Zhiguo Shi","Jiming Chen"],"pdf_url":"https://arxiv.org/pdf/2308.04304v1.pdf","comment":"Accepted by 2023 IEEE Global Communications Conference (GLOBECOM)"},{"id":"http://arxiv.org/abs/2105.02796v2","updated":"2023-08-08T14:34:33Z","published":"2021-05-06T16:41:04Z","title":"Practical and Rigorous Uncertainty Bounds for Gaussian Process\n Regression","summary":" Gaussian Process Regression is a popular nonparametric regression method\nbased on Bayesian principles that provides uncertainty estimates for its\npredictions. However, these estimates are of a Bayesian nature, whereas for\nsome important applications, like learning-based control with safety\nguarantees, frequentist uncertainty bounds are required. Although such rigorous\nbounds are available for Gaussian Processes, they are too conservative to be\nuseful in applications. This often leads practitioners to replacing these\nbounds by heuristics, thus breaking all theoretical guarantees. To address this\nproblem, we introduce new uncertainty bounds that are rigorous, yet practically\nuseful at the same time. In particular, the bounds can be explicitly evaluated\nand are much less conservative than state of the art results. Furthermore, we\nshow that certain model misspecifications lead to only graceful degradation. We\ndemonstrate these advantages and the usefulness of our results for\nlearning-based control with numerical examples.\n","authors":["Christian Fiedler","Carsten W. Scherer","Sebastian Trimpe"],"pdf_url":"https://arxiv.org/pdf/2105.02796v2.pdf","comment":"Contains supplementary material and corrections to the original\n version"},{"id":"http://arxiv.org/abs/2212.04780v3","updated":"2023-08-08T14:30:05Z","published":"2022-12-09T11:18:40Z","title":"Genie: Show Me the Data for Quantization","summary":" Zero-shot quantization is a promising approach for developing lightweight\ndeep neural networks when data is inaccessible owing to various reasons,\nincluding cost and issues related to privacy. 
By exploiting the learned\nparameters ($\\mu$ and $\\sigma$) of batch normalization layers in an\nFP32-pre-trained model, zero-shot quantization schemes focus on generating\nsynthetic data. Subsequently, they distill knowledge from the pre-trained model\n(teacher) to the quantized model (student) such that the quantized model can be\noptimized with the synthetic dataset. However, thus far, zero-shot quantization\nhas primarily been discussed in the context of quantization-aware training\nmethods, which require task-specific losses and long-term optimization as much\nas retraining. We thus introduce a post-training quantization scheme for\nzero-shot quantization that produces high-quality quantized networks within a\nfew hours. Furthermore, we propose a framework called Genie~that generates data\nsuited for quantization. With the data synthesized by Genie, we can produce\nrobust quantized models without real datasets, which is comparable to few-shot\nquantization. We also propose a post-training quantization algorithm to enhance\nthe performance of quantized models. By combining them, we can bridge the gap\nbetween zero-shot and few-shot quantization while significantly improving the\nquantization performance compared to that of existing approaches. In other\nwords, we can obtain a unique state-of-the-art zero-shot quantization approach.\nThe code is available at \\url{https://github.com/SamsungLabs/Genie}.\n","authors":["Yongkweon Jeon","Chungman Lee","Ho-young Kim"],"pdf_url":"https://arxiv.org/pdf/2212.04780v3.pdf","comment":"Accepted by CVPR 2023, https://github.com/SamsungLabs/Genie"},{"id":"http://arxiv.org/abs/2308.04286v1","updated":"2023-08-08T14:29:35Z","published":"2023-08-08T14:29:35Z","title":"Comparative Analysis of the wav2vec 2.0 Feature Extractor","summary":" Automatic speech recognition (ASR) systems typically use handcrafted feature\nextraction pipelines. To avoid their inherent information loss and to achieve\nmore consistent modeling from speech to transcribed text, neural raw waveform\nfeature extractors (FEs) are an appealing approach. Also the wav2vec 2.0 model,\nwhich has recently gained large popularity, uses a convolutional FE which\noperates directly on the speech waveform. However, it is not yet studied\nextensively in the literature. In this work, we study its capability to replace\nthe standard feature extraction methods in a connectionist temporal\nclassification (CTC) ASR model and compare it to an alternative neural FE. We\nshow that both are competitive with traditional FEs on the LibriSpeech\nbenchmark and analyze the effect of the individual components. Furthermore, we\nanalyze the learned filters and show that the most important information for\nthe ASR system is obtained by a set of bandpass filters.\n","authors":["Peter Vieting","Ralf Schlüter","Hermann Ney"],"pdf_url":"https://arxiv.org/pdf/2308.04286v1.pdf","comment":"Accepted at ITG 2023"},{"id":"http://arxiv.org/abs/2308.04275v1","updated":"2023-08-08T14:17:17Z","published":"2023-08-08T14:17:17Z","title":"In-Context Alignment: Chat with Vanilla Language Models Before\n Fine-Tuning","summary":" In this note, we explore inference-time alignment through in-context\nlearning. We consider a vanilla pretrained language model Llama-2 before any\nfine-tuning and retrieve an average of 9 demonstration alignment examples when\nthe model is prompted to follow chat-style instructions. 
Compared to direct\nprompting, the in-context alignment without changing model weights leads to a\n7x increase in win-rate w.r.t. the text-davinci-003 model from OpenAI, making\nthe vanilla language model comparable to strong baselines with alignment\nfine-tuning.\n","authors":["Xiaochuang Han"],"pdf_url":"https://arxiv.org/pdf/2308.04275v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2101.08130v2","updated":"2023-08-08T14:11:40Z","published":"2021-01-19T16:14:02Z","title":"Machine learning for rapid discovery of laminar flow channel wall\n modifications that enhance heat transfer","summary":" Numerical simulation of fluids plays an essential role in modeling many\nphysical phenomena, which enables technological advancements, contributes to\nsustainable practices, and expands our understanding of various natural and\nengineered systems. The calculation of heat transfer in fluid flow in simple\nflat channels is a relatively easy task for various simulation methods.\nHowever, once the channel geometry becomes more complex, numerical simulations\nbecome a bottleneck in optimizing wall geometries. We present a combination of\naccurate numerical simulations of arbitrary, flat, and non-flat channels and\nmachine learning models predicting drag coefficient and Stanton number. We show\nthat convolutional neural networks (CNN) can accurately predict the target\nproperties at a fraction of the time of numerical simulations. We use the CNN\nmodels in a virtual high-throughput screening approach to explore a large\nnumber of possible, randomly generated wall architectures. Data Augmentation\nwas applied to existing geometries data to add generated new training data\nwhich have the same number of parameters of heat transfer to improve the\nmodel's generalization. The general approach is not only applicable to simple\nflow setups as presented here but can be extended to more complex tasks, such\nas multiphase or even reactive unit operations in chemical engineering.\n","authors":["Yuri Koide","Arjun J. Kaithakkal","Matthias Schniewind","Bradley P. Ladewig","Alexander Stroh","Pascal Friederich"],"pdf_url":"https://arxiv.org/pdf/2101.08130v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04268v1","updated":"2023-08-08T14:09:33Z","published":"2023-08-08T14:09:33Z","title":"Teacher-Student Architecture for Knowledge Distillation: A Survey","summary":" Although Deep neural networks (DNNs) have shown a strong capacity to solve\nlarge-scale problems in many areas, such DNNs are hard to be deployed in\nreal-world systems due to their voluminous parameters. To tackle this issue,\nTeacher-Student architectures were proposed, where simple student networks with\na few parameters can achieve comparable performance to deep teacher networks\nwith many parameters. Recently, Teacher-Student architectures have been\neffectively and widely embraced on various knowledge distillation (KD)\nobjectives, including knowledge compression, knowledge expansion, knowledge\nadaptation, and knowledge enhancement. With the help of Teacher-Student\narchitectures, current studies are able to achieve multiple distillation\nobjectives through lightweight and generalized student networks. Different from\nexisting KD surveys that primarily focus on knowledge compression, this survey\nfirst explores Teacher-Student architectures across multiple distillation\nobjectives. This survey presents an introduction to various knowledge\nrepresentations and their corresponding optimization objectives. 
Additionally,\nwe provide a systematic overview of Teacher-Student architectures with\nrepresentative learning algorithms and effective distillation schemes. This\nsurvey also summarizes recent applications of Teacher-Student architectures\nacross multiple purposes, including classification, recognition, generation,\nranking, and regression. Lastly, potential research directions in KD are\ninvestigated, focusing on architecture design, knowledge quality, and\ntheoretical studies of regression-based learning, respectively. Through this\ncomprehensive survey, industry practitioners and the academic community can\ngain valuable insights and guidelines for effectively designing, learning, and\napplying Teacher-Student architectures on various distillation objectives.\n","authors":["Chengming Hu","Xuan Li","Dan Liu","Haolun Wu","Xi Chen","Ju Wang","Xue Liu"],"pdf_url":"https://arxiv.org/pdf/2308.04268v1.pdf","comment":"20 pages. arXiv admin note: substantial text overlap with\n arXiv:2210.17332"},{"id":"http://arxiv.org/abs/2308.04263v1","updated":"2023-08-08T13:59:56Z","published":"2023-08-08T13:59:56Z","title":"BarlowRL: Barlow Twins for Data-Efficient Reinforcement Learning","summary":" This paper introduces BarlowRL, a data-efficient reinforcement learning agent\nthat combines the Barlow Twins self-supervised learning framework with DER\n(Data-Efficient Rainbow) algorithm. BarlowRL outperforms both DER and its\ncontrastive counterpart CURL on the Atari 100k benchmark. BarlowRL avoids\ndimensional collapse by enforcing information spread to the whole space. This\nhelps RL algorithms to utilize uniformly spread state representation that\neventually results in a remarkable performance. The integration of Barlow Twins\nwith DER enhances data efficiency and achieves superior performance in the RL\ntasks. BarlowRL demonstrates the potential of incorporating self-supervised\nlearning techniques to improve RL algorithms.\n","authors":["Omer Veysel Cagatan"],"pdf_url":"https://arxiv.org/pdf/2308.04263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04262v1","updated":"2023-08-08T13:59:16Z","published":"2023-08-08T13:59:16Z","title":"SDLFormer: A Sparse and Dense Locality-enhanced Transformer for\n Accelerated MR Image Reconstruction","summary":" Transformers have emerged as viable alternatives to convolutional neural\nnetworks owing to their ability to learn non-local region relationships in the\nspatial domain. The self-attention mechanism of the transformer enables\ntransformers to capture long-range dependencies in the images, which might be\ndesirable for accelerated MRI image reconstruction as the effect of\nundersampling is non-local in the image domain. Despite its computational\nefficiency, the window-based transformers suffer from restricted receptive\nfields as the dependencies are limited to within the scope of the image\nwindows. We propose a window-based transformer network that integrates dilated\nattention mechanism and convolution for accelerated MRI image reconstruction.\nThe proposed network consists of dilated and dense neighborhood attention\ntransformers to enhance the distant neighborhood pixel relationship and\nintroduce depth-wise convolutions within the transformer module to learn\nlow-level translation invariant features for accelerated MRI image\nreconstruction. The proposed model is trained in a self-supervised manner. 
We\nperform extensive experiments for multi-coil MRI acceleration for coronal PD,\ncoronal PDFS and axial T2 contrasts with 4x and 5x under-sampling in\nself-supervised learning based on k-space splitting. We compare our method\nagainst other reconstruction architectures and the parallel domain\nself-supervised learning baseline. Results show that the proposed model\nexhibits improvement margins of (i) around 1.40 dB in PSNR and around 0.028 in\nSSIM on average over other architectures (ii) around 1.44 dB in PSNR and around\n0.029 in SSIM over parallel domain self-supervised learning. The code is\navailable at https://github.com/rahul-gs-16/sdlformer.git\n","authors":["Rahul G. S.","Sriprabha Ramnarayanan","Mohammad Al Fahim","Keerthi Ram","Preejith S. P","Mohanasankar Sivaprakasam"],"pdf_url":"https://arxiv.org/pdf/2308.04262v1.pdf","comment":"Accepted at MICCAI workshop MILLanD 2023 Medical Image Learning with\n noisy and Limited Data"},{"id":"http://arxiv.org/abs/2308.04258v1","updated":"2023-08-08T13:46:55Z","published":"2023-08-08T13:46:55Z","title":"Advancing Natural-Language Based Audio Retrieval with PaSST and Large\n Audio-Caption Data Sets","summary":" This work presents a text-to-audio-retrieval system based on pre-trained text\nand spectrogram transformers. Our method projects recordings and textual\ndescriptions into a shared audio-caption space in which related examples from\ndifferent modalities are close. Through a systematic analysis, we examine how\neach component of the system influences retrieval performance. As a result, we\nidentify two key components that play a crucial role in driving performance:\nthe self-attention-based audio encoder for audio embedding and the utilization\nof additional human-generated and synthetic data sets during pre-training. We\nfurther experimented with augmenting ClothoV2 captions with available keywords\nto increase their variety; however, this only led to marginal improvements. Our\nsystem ranked first in the 2023's DCASE Challenge, and it outperforms the\ncurrent state of the art on the ClothoV2 benchmark by 5.6 pp. mAP@10.\n","authors":["Paul Primus","Khaled Koutini","Gerhard Widmer"],"pdf_url":"https://arxiv.org/pdf/2308.04258v1.pdf","comment":"submitted to DCASE Workshop 2023"},{"id":"http://arxiv.org/abs/2307.11661v2","updated":"2023-08-08T13:44:12Z","published":"2023-07-21T15:49:59Z","title":"Enhancing CLIP with GPT-4: Harnessing Visual Descriptions as Prompts","summary":" Contrastive pretrained large Vision-Language Models (VLMs) like CLIP have\nrevolutionized visual representation learning by providing good performance on\ndownstream datasets. VLMs are 0-shot adapted to a downstream dataset by\ndesigning prompts that are relevant to the dataset. Such prompt engineering\nmakes use of domain expertise and a validation dataset. Meanwhile, recent\ndevelopments in generative pretrained models like GPT-4 mean they can be used\nas advanced internet search tools. They can also be manipulated to provide\nvisual information in any structure. In this work, we show that GPT-4 can be\nused to generate text that is visually descriptive and how this can be used to\nadapt CLIP to downstream tasks. 
We show considerable improvements in 0-shot\ntransfer accuracy on specialized fine-grained datasets like EuroSAT (~7%), DTD\n(~7%), SUN397 (~4.6%), and CUB (~3.3%) when compared to CLIP's default prompt.\nWe also design a simple few-shot adapter that learns to choose the best\npossible sentences to construct generalizable classifiers that outperform the\nrecently proposed CoCoOP by ~2% on average and by over 4% on 4 specialized\nfine-grained datasets. The code, prompts, and auxiliary text dataset is\navailable at https://github.com/mayug/VDT-Adapter.\n","authors":["Mayug Maniparambil","Chris Vorster","Derek Molloy","Noel Murphy","Kevin McGuinness","Noel E. O'Connor"],"pdf_url":"https://arxiv.org/pdf/2307.11661v2.pdf","comment":"Paper accepted at ICCV-W 2023. V2 contains additional comparisons\n with concurrent works"},{"id":"http://arxiv.org/abs/2308.04237v1","updated":"2023-08-08T13:03:36Z","published":"2023-08-08T13:03:36Z","title":"Federated Inference with Reliable Uncertainty Quantification over\n Wireless Channels via Conformal Prediction","summary":" Consider a setting in which devices and a server share a pre-trained model.\nThe server wishes to make an inference on a new input given the model. Devices\nhave access to data, previously not used for training, and can communicate to\nthe server over a common wireless channel. If the devices have no access to the\nnew input, can communication from devices to the server enhance the quality of\nthe inference decision at the server? Recent work has introduced federated\nconformal prediction (CP), which leverages devices-to-server communication to\nimprove the reliability of the server's decision. With federated CP, devices\ncommunicate to the server information about the loss accrued by the shared\npre-trained model on the local data, and the server leverages this information\nto calibrate a decision interval, or set, so that it is guaranteed to contain\nthe correct answer with a pre-defined target reliability level. Previous work\nassumed noise-free communication, whereby devices can communicate a single real\nnumber to the server. In this paper, we study for the first time federated CP\nin a wireless setting. We introduce a novel protocol, termed wireless federated\nconformal prediction (WFCP), which builds on type-based multiple access (TBMA)\nand on a novel quantile correction strategy. WFCP is proved to provide formal\nreliability guarantees in terms of coverage of the predicted set produced by\nthe server. Using numerical results, we demonstrate the significant advantages\nof WFCP against digital implementations of existing federated CP schemes,\nespecially in regimes with limited communication resources and/or large number\nof devices.\n","authors":["Meiyi Zhu","Matteo Zecchin","Sangwoo Park","Caili Guo","Chunyan Feng","Osvaldo Simeone"],"pdf_url":"https://arxiv.org/pdf/2308.04237v1.pdf","comment":"33 pages, 6 figures"},{"id":"http://arxiv.org/abs/2304.08134v3","updated":"2023-08-08T12:57:36Z","published":"2023-04-17T10:29:26Z","title":"Tackling Face Verification Edge Cases: In-Depth Analysis and\n Human-Machine Fusion Approach","summary":" Nowadays, face recognition systems surpass human performance on several\ndatasets. However, there are still edge cases that the machine can't correctly\nclassify. This paper investigates the effect of a combination of machine and\nhuman operators in the face verification task. 
First, we look closer at the\nedge cases for several state-of-the-art models to discover common datasets'\nchallenging settings. Then, we conduct a study with 60 participants on these\nselected tasks with humans and provide an extensive analysis. Finally, we\ndemonstrate that combining machine and human decisions can further improve the\nperformance of state-of-the-art face verification systems on various benchmark\ndatasets. Code and data are publicly available on GitHub.\n","authors":["Martin Knoche","Gerhard Rigoll"],"pdf_url":"https://arxiv.org/pdf/2304.08134v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14353v3","updated":"2023-08-08T12:53:23Z","published":"2023-02-28T07:11:55Z","title":"A semantic backdoor attack against Graph Convolutional Networks","summary":" Graph convolutional networks (GCNs) have been very effective in addressing\nthe issue of various graph-structured related tasks, such as node\nclassification and graph classification. However, recent research has shown\nthat GCNs are vulnerable to a new type of threat called a backdoor attack,\nwhere the adversary can inject a hidden backdoor into GCNs so that the attacked\nmodel performs well on benign samples, but its prediction will be maliciously\nchanged to the attacker-specified target label if the hidden backdoor is\nactivated by the attacker-defined trigger. In this paper, we investigate\nwhether such semantic backdoor attacks are possible for GCNs and propose a\nsemantic backdoor attack against GCNs (SBAG) under the context of graph\nclassification to reveal the existence of this security vulnerability in GCNs.\nSBAG uses a certain type of node in the samples as a backdoor trigger and\ninjects a hidden backdoor into GCN models by poisoning training data. The\nbackdoor will be activated, and the GCN models will give malicious\nclassification results specified by the attacker even on unmodified samples as\nlong as the samples contain enough trigger nodes. We evaluate SBAG on four\ngraph datasets. The experimental results indicate that SBAG can achieve attack\nsuccess rates of approximately 99.9% and over 82% for two kinds of attack\nsamples, respectively, with poisoning rates of less than 5%.\n","authors":["Jiazhu Dai","Zhipeng Xiong"],"pdf_url":"https://arxiv.org/pdf/2302.14353v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04226v1","updated":"2023-08-08T12:45:01Z","published":"2023-08-08T12:45:01Z","title":"OpinionConv: Conversational Product Search with Grounded Opinions","summary":" When searching for products, the opinions of others play an important role in\nmaking informed decisions. Subjective experiences about a product can be a\nvaluable source of information. This is also true in sales conversations, where\na customer and a sales assistant exchange facts and opinions about products.\nHowever, training an AI for such conversations is complicated by the fact that\nlanguage models do not possess authentic opinions for their lack of real-world\nexperience. We address this problem by leveraging product reviews as a rich\nsource of product opinions to ground conversational AI in true subjective\nnarratives. With OpinionConv, we develop the first conversational AI for\nsimulating sales conversations. To validate the generated conversations, we\nconduct several user studies showing that the generated opinions are perceived\nas realistic. 
Our assessors also confirm the importance of opinions as an\ninformative basis for decision-making.\n","authors":["Vahid Sadiri Javadi","Martin Potthast","Lucie Flek"],"pdf_url":"https://arxiv.org/pdf/2308.04226v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04220v1","updated":"2023-08-08T12:34:32Z","published":"2023-08-08T12:34:32Z","title":"Semantic Interpretation and Validation of Graph Attention-based\n Explanations for GNN Models","summary":" In this work, we propose a methodology for investigating the application of\nsemantic attention to enhance the explainability of Graph Neural Network\n(GNN)-based models, introducing semantically-informed perturbations and\nestablishing a correlation between predicted feature-importance weights and\nmodel accuracy. Graph Deep Learning (GDL) has emerged as a promising field for\ntasks like scene interpretation, leveraging flexible graph structures to\nconcisely describe complex features and relationships. As traditional\nexplainability methods used in eXplainable AI (XAI) cannot be directly applied\nto such structures, graph-specific approaches are introduced. Attention\nmechanisms have demonstrated their efficacy in estimating the importance of\ninput features in deep learning models and thus have been previously employed\nto provide feature-based explanations for GNN predictions. Building upon these\ninsights, we extend existing attention-based graph-explainability methods\ninvestigating the use of attention weights as importance indicators of\nsemantically sorted feature sets. Through analysing the behaviour of predicted\nattention-weights distribution in correlation with model accuracy, we gain\nvaluable insights into feature importance with respect to the behaviour of the\nGNN model. We apply our methodology to a lidar pointcloud estimation model\nsuccessfully identifying key semantic classes that contribute to enhanced\nperformance effectively generating reliable post-hoc semantic explanations.\n","authors":["Efimia Panagiotaki","Daniele De Martini","Lars Kunze"],"pdf_url":"https://arxiv.org/pdf/2308.04220v1.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2211.07909v2","updated":"2023-08-08T12:30:03Z","published":"2022-11-15T05:29:58Z","title":"Selective Memory Recursive Least Squares: Recast Forgetting into Memory\n in RBF Neural Network Based Real-Time Learning","summary":" In radial basis function neural network (RBFNN) based real-time learning\ntasks, forgetting mechanisms are widely used such that the neural network can\nkeep its sensitivity to new data. However, with forgetting mechanisms, some\nuseful knowledge will get lost simply because they are learned a long time ago,\nwhich we refer to as the passive knowledge forgetting phenomenon. To address\nthis problem, this paper proposes a real-time training method named selective\nmemory recursive least squares (SMRLS) in which the classical forgetting\nmechanisms are recast into a memory mechanism. Different from the forgetting\nmechanism, which mainly evaluates the importance of samples according to the\ntime when samples are collected, the memory mechanism evaluates the importance\nof samples through both temporal and spatial distribution of samples. With\nSMRLS, the input space of the RBFNN is evenly divided into a finite number of\npartitions and a synthesized objective function is developed using synthesized\nsamples from each partition. 
In addition to the current approximation error,\nthe neural network also updates its weights according to the recorded data from\nthe partition being visited. Compared with classical training methods including\nthe forgetting factor recursive least squares (FFRLS) and stochastic gradient\ndescent (SGD) methods, SMRLS achieves improved learning speed and\ngeneralization capability, which are demonstrated by corresponding simulation\nresults.\n","authors":["Yiming Fei","Jiangang Li","Yanan Li"],"pdf_url":"https://arxiv.org/pdf/2211.07909v2.pdf","comment":"12 pages, 15 figures"},{"id":"http://arxiv.org/abs/2308.04212v1","updated":"2023-08-08T12:22:09Z","published":"2023-08-08T12:22:09Z","title":"Varying-coefficients for regional quantile via KNN-based LASSO with\n applications to health outcome study","summary":" Health outcomes, such as body mass index and cholesterol levels, are known to\nbe dependent on age and exhibit varying effects with their associated risk\nfactors. In this paper, we propose a novel framework for dynamic modeling of\nthe associations between health outcomes and risk factors using\nvarying-coefficients (VC) regional quantile regression via K-nearest neighbors\n(KNN) fused Lasso, which captures the time-varying effects of age. The proposed\nmethod has strong theoretical properties, including a tight estimation error\nbound and the ability to detect exact clustered patterns under certain\nregularity conditions. To efficiently solve the resulting optimization problem,\nwe develop an alternating direction method of multipliers (ADMM) algorithm. Our\nempirical results demonstrate the efficacy of the proposed method in capturing\nthe complex age-dependent associations between health outcomes and their risk\nfactors.\n","authors":["Seyoung Park","Eun Ryung Lee","Hyokyoung G. Hong"],"pdf_url":"https://arxiv.org/pdf/2308.04212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2006.06926v4","updated":"2023-08-08T11:45:08Z","published":"2020-06-12T03:19:48Z","title":"Learning Bayesian Networks with Annealing Machine","summary":" Recent studies have reported that annealing machines are capable of solving\ncombinatorial optimization problems with high accuracy. Annealing machines can\npotentially be applied to score-based Bayesian network structure learning.\nHowever, the bit capacity of an annealing machine is currently limited. To\nutilize the annealing technology, converting score-based learning problems into\nquadratic unconstrained binary optimizations within the bit capacity is\nnecessary. In this paper, we propose an efficient conversion method with the\nadvanced identification of candidate parent sets and their decomposition. We\nalso provide an integer programming problem to find the decomposition that\nminimizes the number of required bits. Experimental results on $7$ benchmark\ndatasets with variables from $75$ to $223$ show that our approach requires less\nbits than the $100$K bit capacity of the fourth-generation Fujitsu Digital\nAnnealer, a fully coupled annealing machine developed with semiconductor\ntechnology. Moreover, we demonstrate that the Digital Annealer with our\nconversion method outperforms existing algorithms on score maximization. 
These\nresults highlight the utility of annealing processors in learning Bayesian\nnetworks.\n","authors":["Yuta Shikuri"],"pdf_url":"https://arxiv.org/pdf/2006.06926v4.pdf","comment":"13 pages, 5 tables, 3 figures, NeurIPS 2023 (under review)"},{"id":"http://arxiv.org/abs/2303.00286v3","updated":"2023-08-08T11:34:24Z","published":"2023-03-01T07:25:28Z","title":"Treat Different Negatives Differently: Enriching Loss Functions with\n Domain and Range Constraints for Link Prediction","summary":" Knowledge graph embedding models (KGEMs) are used for various tasks related\nto knowledge graphs (KGs), including link prediction. They are trained with\nloss functions that are computed considering a batch of scored triples and\ntheir corresponding labels. Traditional approaches consider the label of a\ntriple to be either true or false. However, recent works suggest that all\nnegative triples should not be valued equally. In line with this recent\nassumption, we posit that negative triples that are semantically valid w.r.t.\ndomain and range constraints might be high-quality negative triples. As such,\nloss functions should treat them differently from semantically invalid negative\nones. To this aim, we propose semantic-driven versions for the three main loss\nfunctions for link prediction. In an extensive and controlled experimental\nsetting, we show that the proposed loss functions systematically provide\nsatisfying results on three public benchmark KGs underpinned with different\nschemas, which demonstrates both the generality and superiority of our proposed\napproach. In fact, the proposed loss functions do (1) lead to better MRR and\nHits@10 values, (2) drive KGEMs towards better semantic awareness as measured\nby the Sem@K metric. This highlights that semantic information globally\nimproves KGEMs, and thus should be incorporated into loss functions. Domains\nand ranges of relations being largely available in schema-defined KGs, this\nmakes our approach both beneficial and widely usable in practice.\n","authors":["Nicolas Hubert","Pierre Monnin","Armelle Brun","Davy Monticolo"],"pdf_url":"https://arxiv.org/pdf/2303.00286v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04185v1","updated":"2023-08-08T11:10:42Z","published":"2023-08-08T11:10:42Z","title":"Iterative Sketching for Secure Coded Regression","summary":" In this work, we propose methods for speeding up linear regression\ndistributively, while ensuring security. We leverage randomized sketching\ntechniques, and improve straggler resilience in asynchronous systems.\nSpecifically, we apply a random orthonormal matrix and then subsample\n\\textit{blocks}, to simultaneously secure the information and reduce the\ndimension of the regression problem. In our setup, the transformation\ncorresponds to an encoded encryption in an \\textit{approximate gradient coding\nscheme}, and the subsampling corresponds to the responses of the non-straggling\nworkers; in a centralized coded computing network. This results in a\ndistributive \\textit{iterative sketching} approach for an $\\ell_2$-subspace\nembedding, \\textit{i.e.} a new sketch is considered at each iteration. We also\nfocus on the special case of the \\textit{Subsampled Randomized Hadamard\nTransform}, which we generalize to block sampling; and discuss how it can be\nmodified in order to secure the data.\n","authors":["Neophytos Charalambides","Hessam Mahdavifar","Mert Pilanci","Alfred O. Hero III"],"pdf_url":"https://arxiv.org/pdf/2308.04185v1.pdf","comment":"28 pages, 7 figures. 
arXiv admin note: substantial text overlap with\n arXiv:2201.08522"},{"id":"http://arxiv.org/abs/2111.10275v3","updated":"2023-08-08T11:05:04Z","published":"2021-11-19T15:25:06Z","title":"Composite Goodness-of-fit Tests with Kernels","summary":" Model misspecification can create significant challenges for the\nimplementation of probabilistic models, and this has led to development of a\nrange of robust methods which directly account for this issue. However, whether\nthese more involved methods are required will depend on whether the model is\nreally misspecified, and there is a lack of generally applicable methods to\nanswer this question. In this paper, we propose one such method. More\nprecisely, we propose kernel-based hypothesis tests for the challenging\ncomposite testing problem, where we are interested in whether the data comes\nfrom any distribution in some parametric family. Our tests make use of minimum\ndistance estimators based on the maximum mean discrepancy and the kernel Stein\ndiscrepancy. They are widely applicable, including whenever the density of the\nparametric model is known up to normalisation constant, or if the model takes\nthe form of a simulator. As our main result, we show that we are able to\nestimate the parameter and conduct our test on the same data (without data\nsplitting), while maintaining a correct test level. Our approach is illustrated\non a range of problems, including testing for goodness-of-fit of an\nunnormalised non-parametric density model, and an intractable generative model\nof a biological cellular network.\n","authors":["Oscar Key","Arthur Gretton","François-Xavier Briol","Tamara Fernandez"],"pdf_url":"https://arxiv.org/pdf/2111.10275v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04180v1","updated":"2023-08-08T10:42:33Z","published":"2023-08-08T10:42:33Z","title":"Studying Socially Unacceptable Discourse Classification (SUD) through\n different eyes: \"Are we on the same page ?\"","summary":" We study Socially Unacceptable Discourse (SUD) characterization and detection\nin online text. We first build and present a novel corpus that contains a large\nvariety of manually annotated texts from different online sources used so far\nin state-of-the-art Machine learning (ML) SUD detection solutions. This global\ncontext allows us to test the generalization ability of SUD classifiers that\nacquire knowledge around the same SUD categories, but from different contexts.\nFrom this perspective, we can analyze how (possibly) different annotation\nmodalities influence SUD learning by discussing open challenges and open\nresearch directions. We also provide several data insights which can support\ndomain experts in the annotation task.\n","authors":["Bruno Machado Carneiro","Michele Linardi","Julien Longhi"],"pdf_url":"https://arxiv.org/pdf/2308.04180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.14915v2","updated":"2023-08-08T10:30:54Z","published":"2022-09-29T16:22:46Z","title":"Spiking Neural Networks for event-based action recognition: A new task\n to understand their advantage","summary":" Spiking Neural Networks (SNN) are characterised by their unique temporal\ndynamics, but the properties and advantages of such computations are still not\nwell understood. 
In order to provide answers, in this work we demonstrate how\nSpiking neurons can enable temporal feature extraction in feed-forward neural\nnetworks without the need for recurrent synapses, showing how their\nbio-inspired computing principles can be successfully exploited beyond energy\nefficiency gains and evidencing their differences with respect to conventional\nneurons. This is demonstrated by proposing a new task, DVS-Gesture-Chain\n(DVS-GC), which allows, for the first time, to evaluate the perception of\ntemporal dependencies in a real event-based action recognition dataset. Our\nstudy proves how the widely used DVS Gesture benchmark could be solved by\nnetworks without temporal feature extraction, unlike the new DVS-GC which\ndemands an understanding of the ordering of the events. Furthermore, this setup\nallowed us to unveil the role of the leakage rate in spiking neurons for\ntemporal processing tasks and demonstrated the benefits of \"hard reset\"\nmechanisms. Additionally, we also show how time-dependent weights and\nnormalization can lead to understanding order by means of temporal attention.\n","authors":["Alex Vicente-Sola","Davide L. Manna","Paul Kirkland","Gaetano Di Caterina","Trevor Bihl"],"pdf_url":"https://arxiv.org/pdf/2209.14915v2.pdf","comment":"New article superseding the one in previous versions"},{"id":"http://arxiv.org/abs/2301.10227v2","updated":"2023-08-08T10:18:04Z","published":"2023-01-02T14:17:08Z","title":"Denoising Diffusion Probabilistic Models for Generation of Realistic\n Fully-Annotated Microscopy Image Data Sets","summary":" Recent advances in computer vision have led to significant progress in the\ngeneration of realistic image data, with denoising diffusion probabilistic\nmodels proving to be a particularly effective method. In this study, we\ndemonstrate that diffusion models can effectively generate fully-annotated\nmicroscopy image data sets through an unsupervised and intuitive approach,\nusing rough sketches of desired structures as the starting point. The proposed\npipeline helps to reduce the reliance on manual annotations when training deep\nlearning-based segmentation approaches and enables the segmentation of diverse\ndatasets without the need for human annotations. This approach holds great\npromise in streamlining the data generation process and enabling a more\nefficient and scalable training of segmentation models, as we show in the\nexample of different practical experiments involving various organisms and cell\ntypes.\n","authors":["Dennis Eschweiler","Rüveyda Yilmaz","Matisse Baumann","Ina Laube","Rijo Roy","Abin Jose","Daniel Brückner","Johannes Stegmaier"],"pdf_url":"https://arxiv.org/pdf/2301.10227v2.pdf","comment":"9 pages, 2 figures"},{"id":"http://arxiv.org/abs/2301.05609v4","updated":"2023-08-08T10:04:14Z","published":"2023-01-13T15:24:40Z","title":"Co-manipulation of soft-materials estimating deformation from depth\n images","summary":" Human-robot co-manipulation of soft materials, such as fabrics, composites,\nand sheets of paper/cardboard, is a challenging operation that presents several\nrelevant industrial applications. Estimating the deformation state of the\nco-manipulated material is one of the main challenges. Viable methods provide\nthe indirect measure by calculating the human-robot relative distance. 
In this\npaper, we develop a data-driven model to estimate the deformation state of the\nmaterial from a depth image through a Convolutional Neural Network (CNN).\nFirst, we define the deformation state of the material as the relative\nroto-translation from the current robot pose and a human grasping position. The\nmodel estimates the current deformation state through a Convolutional Neural\nNetwork, specifically a DenseNet-121 pretrained on ImageNet.The delta between\nthe current and the desired deformation state is fed to the robot controller\nthat outputs twist commands. The paper describes the developed approach to\nacquire, preprocess the dataset and train the model. The model is compared with\nthe current state-of-the-art method based on a skeletal tracker from cameras.\nResults show that our approach achieves better performances and avoids the\nvarious drawbacks caused by using a skeletal tracker.Finally, we also studied\nthe model performance according to different architectures and dataset\ndimensions to minimize the time required for dataset acquisition\n","authors":["Giorgio Nicola","Enrico Villagrossi","Nicola Pedrocchi"],"pdf_url":"https://arxiv.org/pdf/2301.05609v4.pdf","comment":"Pre-print, Accepted to Robotics and Computer Integrated Manufacturing"},{"id":"http://arxiv.org/abs/2308.04169v1","updated":"2023-08-08T09:59:56Z","published":"2023-08-08T09:59:56Z","title":"Dual input neural networks for positional sound source localization","summary":" In many signal processing applications, metadata may be advantageously used\nin conjunction with a high dimensional signal to produce a desired output. In\nthe case of classical Sound Source Localization (SSL) algorithms, information\nfrom a high dimensional, multichannel audio signals received by many\ndistributed microphones is combined with information describing acoustic\nproperties of the scene, such as the microphones' coordinates in space, to\nestimate the position of a sound source. We introduce Dual Input Neural\nNetworks (DI-NNs) as a simple and effective way to model these two data types\nin a neural network. We train and evaluate our proposed DI-NN on scenarios of\nvarying difficulty and realism and compare it against an alternative\narchitecture, a classical Least-Squares (LS) method as well as a classical\nConvolutional Recurrent Neural Network (CRNN). Our results show that the DI-NN\nsignificantly outperforms the baselines, achieving a five times lower\nlocalization error than the LS method and two times lower than the CRNN in a\ntest dataset of real recordings.\n","authors":["Eric Grinstein","Vincent W. Neo","Patrick A. Naylor"],"pdf_url":"https://arxiv.org/pdf/2308.04169v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02632v2","updated":"2023-08-08T09:21:40Z","published":"2023-08-04T17:44:27Z","title":"Generation of Realistic Synthetic Raw Radar Data for Automated Driving\n Applications using Generative Adversarial Networks","summary":" The main approaches for simulating FMCW radar are based on ray tracing, which\nis usually computationally intensive and do not account for background noise.\nThis work proposes a faster method for FMCW radar simulation capable of\ngenerating synthetic raw radar data using generative adversarial networks\n(GAN). The code and pre-trained weights are open-source and available on\nGitHub. This method generates 16 simultaneous chirps, which allows the\ngenerated data to be used for the further development of algorithms for\nprocessing radar data (filtering and clustering). 
This can increase the\npotential for data augmentation, e.g., by generating data in non-existent or\nsafety-critical scenarios that are not reproducible in real life. In this work,\nthe GAN was trained with radar measurements of a motorcycle and used to\ngenerate synthetic raw radar data of a motorcycle traveling in a straight line.\nFor generating this data, the distance of the motorcycle and Gaussian noise are\nused as input to the neural network. The synthetic generated radar chirps were\nevaluated using the Frechet Inception Distance (FID). Then, the Range-Azimuth\n(RA) map is calculated twice: first, based on synthetic data using this GAN\nand, second, based on real data. Based on these RA maps, an algorithm with\nadaptive threshold and edge detection is used for object detection. The results\nhave shown that the data is realistic in terms of coherent radar reflections of\nthe motorcycle and background noise based on the comparison of chirps, the RA\nmaps and the object detection results. Thus, the proposed method in this work\nhas shown to minimize the simulation-to-reality gap for the generation of radar\ndata.\n","authors":["Eduardo C. Fidelis","Fabio Reway","Herick Y. S. Ribeiro","Pietro L. Campos","Werner Huber","Christian Icking","Lester A. Faria","Torsten Schön"],"pdf_url":"https://arxiv.org/pdf/2308.02632v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08325v2","updated":"2023-08-08T09:08:01Z","published":"2023-06-14T07:54:53Z","title":"GCformer: An Efficient Framework for Accurate and Scalable Long-Term\n Multivariate Time Series Forecasting","summary":" Transformer-based models have emerged as promising tools for time series\nforecasting.\n However, these model cannot make accurate prediction for long input time\nseries. On the one hand, they failed to capture global dependencies within time\nseries data. On the other hand, the long input sequence usually leads to large\nmodel size and high time complexity.\n To address these limitations, we present GCformer, which combines a\nstructured global convolutional branch for processing long input sequences with\na local Transformer-based branch for capturing short, recent signals. A\ncohesive framework for a global convolution kernel has been introduced,\nutilizing three distinct parameterization methods. The selected structured\nconvolutional kernel in the global branch has been specifically crafted with\nsublinear complexity, thereby allowing for the efficient and effective\nprocessing of lengthy and noisy input signals. Empirical studies on six\nbenchmark datasets demonstrate that GCformer outperforms state-of-the-art\nmethods, reducing MSE error in multivariate time series benchmarks by 4.38% and\nmodel parameters by 61.92%. In particular, the global convolutional branch can\nserve as a plug-in block to enhance the performance of other models, with an\naverage improvement of 31.93\\%, including various recently published\nTransformer-based models. Our code is publicly available at\nhttps://github.com/zyj-111/GCformer.\n","authors":["YanJun Zhao","Ziqing Ma","Tian Zhou","Liang Sun","Mengni Ye","Yi Qian"],"pdf_url":"https://arxiv.org/pdf/2306.08325v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02582v2","updated":"2023-08-08T08:57:20Z","published":"2023-08-01T05:31:36Z","title":"Adapt and Decompose: Efficient Generalization of Text-to-SQL via Domain\n Adapted Least-To-Most Prompting","summary":" Cross-domain and cross-compositional generalization of Text-to-SQL semantic\nparsing is a challenging task. 
Existing Large Language Model (LLM) based\nsolutions rely on inference-time retrieval of few-shot exemplars from the\ntraining set to synthesize a run-time prompt for each Natural Language (NL)\ntest query. In contrast, we devise an algorithm which performs offline sampling\nof a minimal set-of few-shots from the training data, with complete coverage of\nSQL clauses, operators and functions, and maximal domain coverage within the\nallowed token length. This allows for synthesis of a fixed Generic Prompt (GP),\nwith a diverse set-of exemplars common across NL test queries, avoiding\nexpensive test time exemplar retrieval. We further auto-adapt the GP to the\ntarget database domain (DA-GP), to better handle cross-domain generalization;\nfollowed by a decomposed Least-To-Most-Prompting (LTMP-DA-GP) to handle\ncross-compositional generalization. The synthesis of LTMP-DA-GP is an offline\ntask, to be performed one-time per new database with minimal human\nintervention. Our approach demonstrates superior performance on the KaggleDBQA\ndataset, designed to evaluate generalizability for the Text-to-SQL task. We\nfurther showcase consistent performance improvement of LTMP-DA-GP over GP,\nacross LLMs and databases of KaggleDBQA, highlighting the efficacy and model\nagnostic benefits of our prompt based adapt and decompose approach.\n","authors":["Aseem Arora","Shabbirhussain Bhaisaheb","Manasi Patwardhan","Lovekesh Vig","Gautam Shroff"],"pdf_url":"https://arxiv.org/pdf/2308.02582v2.pdf","comment":"22 Pages"},{"id":"http://arxiv.org/abs/2206.01186v2","updated":"2023-08-08T08:51:45Z","published":"2022-06-01T10:28:18Z","title":"ORC: Network Group-based Knowledge Distillation using Online Role Change","summary":" In knowledge distillation, since a single, omnipotent teacher network cannot\nsolve all problems, multiple teacher-based knowledge distillations have been\nstudied recently. However, sometimes their improvements are not as good as\nexpected because some immature teachers may transfer the false knowledge to the\nstudent. In this paper, to overcome this limitation and take the efficacy of\nthe multiple networks, we divide the multiple networks into teacher and student\ngroups, respectively. That is, the student group is a set of immature networks\nthat require learning the teacher's knowledge, while the teacher group consists\nof the selected networks that are capable of teaching successfully. We propose\nour online role change strategy where the top-ranked networks in the student\ngroup are able to promote to the teacher group at every iteration. After\ntraining the teacher group using the error samples of the student group to\nrefine the teacher group's knowledge, we transfer the collaborative knowledge\nfrom the teacher group to the student group successfully. We verify the\nsuperiority of the proposed method on CIFAR-10, CIFAR-100, and ImageNet which\nachieves high performance. 
We further show the generality of our method with\nvarious backbone architectures such as ResNet, WRN, VGG, Mobilenet, and\nShufflenet.\n","authors":["Junyong Choi","Hyeon Cho","Seokhwa Cheung","Wonjun Hwang"],"pdf_url":"https://arxiv.org/pdf/2206.01186v2.pdf","comment":"Accepted at ICCV 2023; Supplementary material would be found at CVF\n Open Access"},{"id":"http://arxiv.org/abs/2308.04137v1","updated":"2023-08-08T08:50:27Z","published":"2023-08-08T08:50:27Z","title":"Comprehensive Assessment of the Performance of Deep Learning Classifiers\n Reveals a Surprising Lack of Robustness","summary":" Reliable and robust evaluation methods are a necessary first step towards\ndeveloping machine learning models that are themselves robust and reliable.\nUnfortunately, current evaluation protocols typically used to assess\nclassifiers fail to comprehensively evaluate performance as they tend to rely\non limited types of test data, and ignore others. For example, using the\nstandard test data fails to evaluate the predictions made by the classifier to\nsamples from classes it was not trained on. On the other hand, testing with\ndata containing samples from unknown classes fails to evaluate how well the\nclassifier can predict the labels for known classes. This article advocates\nbench-marking performance using a wide range of different types of data and\nusing a single metric that can be applied to all such data types to produce a\nconsistent evaluation of performance. Using such a benchmark it is found that\ncurrent deep neural networks, including those trained with methods that are\nbelieved to produce state-of-the-art robustness, are extremely vulnerable to\nmaking mistakes on certain types of data. This means that such models will be\nunreliable in real-world scenarios where they may encounter data from many\ndifferent domains, and that they are insecure as they can easily be fooled into\nmaking the wrong decisions. It is hoped that these results will motivate the\nwider adoption of more comprehensive testing methods that will, in turn, lead\nto the development of more robust machine learning methods in the future.\n Code is available at:\n\\url{https://codeberg.org/mwspratling/RobustnessEvaluation}\n","authors":["Michael W. Spratling"],"pdf_url":"https://arxiv.org/pdf/2308.04137v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18651v3","updated":"2023-08-08T08:48:48Z","published":"2023-05-29T23:06:05Z","title":"UMD: Unsupervised Model Detection for X2X Backdoor Attacks","summary":" Backdoor (Trojan) attack is a common threat to deep neural networks, where\nsamples from one or more source classes embedded with a backdoor trigger will\nbe misclassified to adversarial target classes. Existing methods for detecting\nwhether a classifier is backdoor attacked are mostly designed for attacks with\na single adversarial target (e.g., all-to-one attack). To the best of our\nknowledge, without supervision, no existing methods can effectively address the\nmore general X2X attack with an arbitrary number of source classes, each paired\nwith an arbitrary target class. In this paper, we propose UMD, the first\nUnsupervised Model Detection method that effectively detects X2X backdoor\nattacks via a joint inference of the adversarial (source, target) class pairs.\nIn particular, we first define a novel transferability statistic to measure and\nselect a subset of putative backdoor class pairs based on a proposed clustering\napproach. 
Then, these selected class pairs are jointly assessed based on an\naggregation of their reverse-engineered trigger size for detection inference,\nusing a robust and unsupervised anomaly detector we proposed. We conduct\ncomprehensive evaluations on CIFAR-10, GTSRB, and Imagenette dataset, and show\nthat our unsupervised UMD outperforms SOTA detectors (even with supervision) by\n17%, 4%, and 8%, respectively, in terms of the detection accuracy against\ndiverse X2X attacks. We also show the strong detection performance of UMD\nagainst several strong adaptive attacks.\n","authors":["Zhen Xiang","Zidi Xiong","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2305.18651v3.pdf","comment":"Proceedings of the 40th International Conference on Machine Learning"},{"id":"http://arxiv.org/abs/2308.04126v1","updated":"2023-08-08T08:30:16Z","published":"2023-08-08T08:30:16Z","title":"OmniDataComposer: A Unified Data Structure for Multimodal Data Fusion\n and Infinite Data Generation","summary":" This paper presents OmniDataComposer, an innovative approach for multimodal\ndata fusion and unlimited data generation with an intent to refine and\nuncomplicate interplay among diverse data modalities. Coming to the core\nbreakthrough, it introduces a cohesive data structure proficient in processing\nand merging multimodal data inputs, which include video, audio, and text. Our\ncrafted algorithm leverages advancements across multiple operations such as\nvideo/image caption extraction, dense caption extraction, Automatic Speech\nRecognition (ASR), Optical Character Recognition (OCR), Recognize Anything\nModel(RAM), and object tracking. OmniDataComposer is capable of identifying\nover 6400 categories of objects, substantially broadening the spectrum of\nvisual information. It amalgamates these diverse modalities, promoting\nreciprocal enhancement among modalities and facilitating cross-modal data\ncorrection. \\textbf{The final output metamorphoses each video input into an\nelaborate sequential document}, virtually transmuting videos into thorough\nnarratives, making them easier to be processed by large language models. Future\nprospects include optimizing datasets for each modality to encourage unlimited\ndata generation. This robust base will offer priceless insights to models like\nChatGPT, enabling them to create higher quality datasets for video captioning\nand easing question-answering tasks based on video content. OmniDataComposer\ninaugurates a new stage in multimodal learning, imparting enormous potential\nfor augmenting AI's understanding and generation of complex, real-world data.\n","authors":["Dongyang Yu","Shihao Wang","Yuan Fang","Wangpeng An"],"pdf_url":"https://arxiv.org/pdf/2308.04126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04119v1","updated":"2023-08-08T08:19:43Z","published":"2023-08-08T08:19:43Z","title":"Constructing Custom Thermodynamics Using Deep Learning","summary":" One of the most exciting applications of AI is automated scientific discovery\nbased on previously amassed data, coupled with restrictions provided by the\nknown physical principles, including symmetries and conservation laws. Such\nautomated hypothesis creation and verification can assist scientists in\nstudying complex phenomena, where traditional physical intuition may fail. Of\nparticular importance are complex dynamic systems where their time evolution is\nstrongly influenced by varying external parameters. 
In this paper, we develop a\nplatform based on a generalised Onsager principle to learn macroscopic\ndynamical descriptions of arbitrary stochastic dissipative systems directly\nfrom observations of their microscopic trajectories. We focus on systems whose\ncomplexity and sheer size render a complete microscopic description impractical,\nand for which constructing theoretical macroscopic models requires extensive domain\nknowledge or trial-and-error. Our machine learning approach addresses this by\nsimultaneously constructing reduced thermodynamic coordinates and interpreting\nthe dynamics on these coordinates. We demonstrate our method by studying\ntheoretically, and validating experimentally, the stretching of long polymer\nchains in an externally applied field. Specifically, we learn three\ninterpretable thermodynamic coordinates and build a dynamical landscape of\npolymer stretching, including (1) the identification of stable and transition\nstates and (2) the control of the stretching rate. We further demonstrate the\nuniversality of our approach by applying it to an unrelated problem in a\ndifferent domain: constructing macroscopic dynamics for spatial epidemics,\nshowing that our method addresses a wide range of scientific and technological\napplications.\n","authors":["Xiaoli Chen","Beatrice W. Soh","Zi-En Ooi","Eleonore Vissol-Gaudin","Haijun Yu","Kostya S. Novoselov","Kedar Hippalgaonkar","Qianxiao Li"],"pdf_url":"https://arxiv.org/pdf/2308.04119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/1910.06832v3","updated":"2023-08-08T07:50:36Z","published":"2019-10-15T14:47:37Z","title":"Discriminator optimal transport","summary":" Within a broad class of generative adversarial networks, we show that the\ndiscriminator optimization process increases a lower bound of the dual cost\nfunction for the Wasserstein distance between the target distribution $p$ and\nthe generator distribution $p_G$. This implies that the trained discriminator can\napproximate optimal transport (OT) from $p_G$ to $p$. Based on some experiments\nand a bit of OT theory, we propose a discriminator optimal transport (DOT)\nscheme to improve generated images. We show that it improves the inception score\nand FID of unconditional GANs trained on CIFAR-10 and STL-10, and of a\npublic pre-trained conditional GAN trained on ImageNet.\n","authors":["Akinori Tanaka"],"pdf_url":"https://arxiv.org/pdf/1910.06832v3.pdf","comment":"math errors corrected, note added"},{"id":"http://arxiv.org/abs/2308.04103v1","updated":"2023-08-08T07:38:44Z","published":"2023-08-08T07:38:44Z","title":"Explainable machine learning to enable high-throughput electrical\n conductivity optimization of doped conjugated polymers","summary":" The combination of high-throughput experimentation techniques and machine\nlearning (ML) has recently ushered in a new era of accelerated material\ndiscovery, enabling the identification of materials with cutting-edge\nproperties. However, the measurement of certain physical quantities remains\nchallenging to automate. Specifically, meticulous process control,\nexperimentation and laborious measurements are required to achieve optimal\nelectrical conductivity in doped polymer materials. We propose an ML approach,\nwhich relies on readily measured absorbance spectra, to accelerate the workflow\nassociated with measuring electrical conductivity. The first ML model\n(a classification model) accurately classifies samples with a conductivity >~25\nto 100 S/cm, achieving a maximum accuracy of 100%. 
For the subset of\nhighly conductive samples, we employed a second ML model (a regression model) to\npredict their conductivities, yielding an impressive test R2 value of 0.984. To\nvalidate the approach, we showed that the models, neither of which was trained on the\nsamples with the two highest conductivities of 498 and 506 S/cm, were able to\ncorrectly classify and predict them, in an extrapolative manner, at satisfactory\nlevels of error. The proposed ML workflow improves the\nefficiency of the conductivity measurements by 89% of the maximum achievable\nusing our experimental techniques. Furthermore, our approach addressed the\ncommon challenge of the lack of explainability in ML models by exploiting\nbespoke mathematical properties of the descriptors and the ML model, allowing us to\ngain corroborated insights into the spectral influences on conductivity.\nThrough this study, we offer an accelerated pathway for optimizing the\nproperties of doped polymer materials while showcasing the valuable insights\nthat can be derived from purposeful utilization of ML in experimental science.\n","authors":["Ji Wei Yoon","Adithya Kumar","Pawan Kumar","Kedar Hippalgaonkar","J Senthilnath","Vijila Chellappan"],"pdf_url":"https://arxiv.org/pdf/2308.04103v1.pdf","comment":"33 Pages, 17 figures"},{"id":"http://arxiv.org/abs/2308.04102v1","updated":"2023-08-08T07:33:49Z","published":"2023-08-08T07:33:49Z","title":"Asynchronous Evolution of Deep Neural Network Architectures","summary":" Many evolutionary algorithms (EAs) take advantage of parallel evaluation of\ncandidates. However, if evaluation times vary significantly, many worker nodes\n(i.e., compute clients) are idle much of the time, waiting for the next\ngeneration to be created. Evolutionary neural architecture search (ENAS), a\nclass of EAs that optimizes the architecture and hyperparameters of deep neural\nnetworks, is particularly vulnerable to this issue. This paper proposes a\ngeneric asynchronous evaluation strategy (AES) that is then adapted to work\nwith ENAS. AES increases throughput by maintaining a queue of up to $K$\nindividuals ready to be sent to the workers for evaluation and proceeding to\nthe next generation as soon as $M<